Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 47 additions & 3 deletions pure-magic/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3357,7 +3357,9 @@ impl MagicDb {
};

let buf = haystack.read_range(0..FILE_BYTES_MAX as u64)?;
let mut reader = csv::Reader::from_reader(io::Cursor::new(buf));
let mut reader = csv::ReaderBuilder::new()
.has_headers(false)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`has_headers(false)` is the load-bearing bit for two-line CSVs. With the default (`true`), `csv::Reader` consumes `a,b` as a header, so `records()` yields only one data row from `a,b\n1,2\n`; `n` then stays at 1 and the `n < 2` reject fires. libmagic counts newlines, not data rows, so we need every line in the count.

.from_reader(io::Cursor::new(buf));
let mut records = reader.records();

let Some(Ok(first)) = records.next() else {
Expand All @@ -3384,8 +3386,8 @@ impl MagicDb {
n += 1;
}

// we need at least 10 lines
if n != 10 {
// we need at least 2 lines (matches file command https://github.com/file/file/commit/b4e621d1d5b3e9d142dd23030cca09f6f198e18b)
if n < 2 {

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

n < 2 matches upstream is_csv.c: return tf > 1 && nl >= 2. The first.len() <= 1 check just above is the tf > 1 half (≥2 fields per row), and this is the nl >= 2 half (≥2 records).

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the code equivalent to tf > 1

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The minimum-number-of-fields requirement is handled by the check above:

        // very not likely a CSV otherwise all programming
        // languages having ; line terminator would be
        // considered as CSV
        if first.len() <= 1 {
            return Ok(false);
        }

return Ok(false);
}

Expand Down Expand Up @@ -4887,4 +4889,46 @@ HelloWorld
db.load_bulk(rules.into_iter());
assert!(matches!(db.verify(), Err(Error::Verify(_, _, _))));
}

/// Runs detection on `content` with a rule set that can never match.
///
/// `try_csv` runs before any rule, so supplying a single never-matching
/// rule means only the hardcoded CSV detector is exercised by the harness.
/// Panics if `first_magic` returns an error.
fn csv_magic(content: &[u8]) -> Magic<'static> {
    // A lone rule whose pattern is not expected to appear in any test input.
    let never_matching_rule = "0\tstring\t__NEVER_MATCH__\tnope\n";
    let stream_kind = StreamKind::Text(TextEncoding::Utf8);

    first_magic(never_matching_rule, content, stream_kind).unwrap()
}

#[test]
fn test_csv_two_rows_two_cols() {
    // Two lines of two fields each must be reported as CSV.
    let detected = csv_magic(b"a,b\n1,2\n");
    assert_eq!(detected.mime_type(), "text/csv");
}

#[test]
fn test_csv_short_consistent_rows() {
    // Five lines, each with exactly three fields, must be reported as CSV.
    let detected = csv_magic(b"a,b,c\n1,2,3\n4,5,6\n7,8,9\n10,11,12\n");
    assert_eq!(detected.mime_type(), "text/csv");
}

#[test]
fn test_csv_many_rows_still_detected() {
    // Twelve lines of three fields each: longer inputs must still be
    // reported as CSV.
    let detected = csv_magic(
        b"a,b,c\n1,2,3\n4,5,6\n7,8,9\n10,11,12\n13,14,15\n16,17,18\n19,20,21\n22,23,24\n25,26,27\n28,29,30\n31,32,33\n",
    );
    assert_eq!(detected.mime_type(), "text/csv");
}

#[test]
fn test_csv_single_field_rejected() {
    // Every line holds a single field (no delimiter at all), so the input
    // must not be reported as CSV.
    let detected = csv_magic(b"hello\nworld\nfoo\n");
    assert_ne!(detected.mime_type(), "text/csv");
}

#[test]
fn test_csv_ragged_columns_rejected() {
    // The second line has two fields while the others have three; an
    // inconsistent field count must not be reported as CSV.
    let detected = csv_magic(b"a,b,c\n1,2\n3,4,5\n");
    assert_ne!(detected.mime_type(), "text/csv");
}
}
Loading