diff --git a/fuzz/fuzz_targets/differential.rs b/fuzz/fuzz_targets/differential.rs index 9b708c7..d8c817d 100644 --- a/fuzz/fuzz_targets/differential.rs +++ b/fuzz/fuzz_targets/differential.rs @@ -30,6 +30,12 @@ //! character/block devices, symbolic links, hard links) but whose `size` //! field is non-zero. tar-rs silently accepts such archives and treats the //! non-zero size as content bytes, which can lead to stream desynchronisation. +//! +//! - **GNU LongName/LongLink NUL truncation**: tar-core truncates the resolved +//! path/link-target at the first NUL byte, matching GNU tar's C-string +//! convention. tar-rs returns the full content without truncation. This is +//! normalized in `parse_tar_rs` (in testutil) before comparison, not treated +//! as a hard error. #![no_main] @@ -58,7 +64,8 @@ fn dump_headers(data: &[u8]) { } /// Returns true if the error is a known behavioral difference where -/// tar-core is intentionally stricter than tar-rs. +/// tar-core is intentionally stricter than tar-rs in ways that produce +/// hard errors (not just output normalization). /// /// When this returns true, tar-rs may have parsed more entries than /// tar-core, and that's expected. @@ -84,6 +91,11 @@ fn is_allowlisted_divergence(err: &ParseError) -> bool { /// all-null numeric fields are accepted as 0), so we only require that /// tar-core parses *at least* as many entries as tar-rs and that those /// entries match. +/// +/// Note: paths and link_targets from tar-rs are pre-normalized by +/// `truncate_at_nul` in `parse_tar_rs` to account for tar-core correctly +/// truncating GNU LongName/LongLink content at the first NUL byte (matching +/// GNU tar's C-string convention) while tar-rs does not. 
fn compare_entries( data: &[u8], tar_rs_entries: &[OwnedEntry], diff --git a/src/parse.rs b/src/parse.rs index 4dac36d..53aa0c3 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -1006,14 +1006,20 @@ impl Parser { let content_end = content_start + size as usize; let mut data: &'a [u8] = &input[content_start..content_end]; - // Strip trailing null for GNU long name/link + // Truncate GNU long name/link content at the first NUL byte. + // + // The GNU tar format specifies that LongName/LongLink content is a + // NUL-terminated C string. Stopping at the first NUL (not just + // stripping a trailing one) is correct: content that is padded with + // zeros to fill a block boundary, or that has an embedded NUL due to + // archive corruption or overlay, should not contribute bytes to the + // path beyond the terminator. This matches what GNU tar, Python + // tarfile, and Go archive/tar all do. if matches!( kind, ExtensionKind::GnuLongName | ExtensionKind::GnuLongLink ) { - if let Some(trimmed) = data.strip_suffix(&[0]) { - data = trimmed; - } + data = crate::truncate_null(data); self.limits.check_path_len(data.len())?; } @@ -1959,6 +1965,73 @@ mod tests { // GNU long name tests // ========================================================================= + /// Build a GNU `L` header whose content is `raw_content` verbatim + /// (no trailing NUL added). Used for testing embedded-NUL truncation. + fn make_gnu_long_name_raw(raw_content: &[u8]) -> Vec<u8> { + let padded = raw_content.len().next_multiple_of(HEADER_SIZE); + let header = make_header(b"././@LongLink", raw_content.len() as u64, b'L'); + let mut result = Vec::with_capacity(HEADER_SIZE + padded); + result.extend_from_slice(&header); + result.extend_from_slice(raw_content); + result.extend(zeroes(padded - raw_content.len())); + result + } + + /// GNU LongName content is NUL-terminated: an embedded NUL must truncate + /// the path just like a trailing NUL does. This matches GNU tar, Python + /// tarfile, and Go archive/tar.
+ #[test] + fn test_parser_gnu_long_name_embedded_nul_truncates() { + // Content: "safe\x00evil" — the first NUL terminates at "safe". + let raw: &[u8] = b"safe\x00evil"; + + let mut archive = Vec::new(); + archive.extend(make_gnu_long_name_raw(raw)); + archive.extend_from_slice(&make_header(b"placeholder", 0, b'0')); + archive.extend(zeroes(1024)); + + let mut parser = Parser::new(Limits::default()); + match parser.parse(&archive).unwrap() { + ParseEvent::Entry { entry, .. } => { + assert_eq!( + entry.path.as_ref(), + b"safe", + "embedded NUL should truncate LongName path" + ); + } + other => panic!("Expected Entry, got {:?}", other), + } + } + + /// A GNU LongName padded with NUL bytes to 100 bytes, followed by mode-like + /// ASCII bytes within the declared size — the pattern produced by the overlay + /// mutation strategy. Only the bytes before the first NUL should be used. + #[test] + fn test_parser_gnu_long_name_nul_padded_header_bytes() { + // "safe" + 96 NUL bytes (fills a ustar name field) + "0000644\x00" + let mut raw: Vec<u8> = b"safe".to_vec(); + raw.resize(100, 0u8); + raw.extend_from_slice(b"0000644\x00"); + assert_eq!(raw.len(), 108); + + let mut archive = Vec::new(); + archive.extend(make_gnu_long_name_raw(&raw)); + archive.extend_from_slice(&make_header(b"placeholder", 0, b'0')); + archive.extend(zeroes(1024)); + + let mut parser = Parser::new(Limits::default()); + match parser.parse(&archive).unwrap() { + ParseEvent::Entry { entry, .. 
} => { + assert_eq!( + entry.path.as_ref(), + b"safe", + "NUL padding after short name must truncate, not include mode-field bytes" + ); + } + other => panic!("Expected Entry, got {:?}", other), + } + } + #[test] fn test_parser_gnu_long_name() { // Create archive with GNU long name entry followed by actual file diff --git a/testutil/src/lib.rs b/testutil/src/lib.rs index c5b2c22..2d61397 100644 --- a/testutil/src/lib.rs +++ b/testutil/src/lib.rs @@ -171,6 +171,20 @@ pub fn parse_tar_core_detailed(data: &[u8], limits: Limits) -> TarCoreParseResul } } +/// Truncate a byte slice at the first NUL byte, if any. +/// +/// GNU LongName/LongLink content is NUL-terminated (C-string convention). +/// tar-core truncates at the first NUL when resolving these extension headers, +/// matching GNU tar and POSIX filesystem semantics (NUL is not a valid filename +/// character). tar-rs does not perform this truncation, so we normalize its +/// output here before comparison. +fn truncate_at_nul(bytes: Vec<u8>) -> Vec<u8> { + match bytes.iter().position(|&b| b == 0) { + Some(pos) => bytes[..pos].to_vec(), + None => bytes, + } +} + /// Parse a tar archive with the `tar` crate, returning owned entries. pub fn parse_tar_rs(data: &[u8]) -> Vec<OwnedEntry> { let mut results = Vec::new(); @@ -190,7 +204,11 @@ pub fn parse_tar_rs(data: &[u8]) -> Vec<OwnedEntry> { let header = entry.header().clone(); let entry_type = header.entry_type().as_byte(); - let path = entry.path_bytes().into_owned(); + // Normalize NUL-termination: tar-rs does not truncate GNU LongName/ + // LongLink content at the first NUL byte; tar-core does (matching the + // C-string convention used by GNU tar). Truncate here so we compare + // equivalent representations. + let path = truncate_at_nul(entry.path_bytes().into_owned()); let size = entry.size(); // Require that numeric fields parse successfully. 
tar-core @@ -229,10 +247,12 @@ pub fn parse_tar_rs(data: &[u8]) -> Vec<OwnedEntry> { } // entry.link_name_bytes() applies PAX linkpath and GNU long link // overrides, unlike header.link_name_bytes() which is raw. + // Also truncate at the first NUL to match tar-core's behavior for + // GNU LongLink content (same NUL-termination normalization as path). let link_target = entry .link_name_bytes() .filter(|b| !b.is_empty()) - .map(|b| b.to_vec()); + .map(|b| truncate_at_nul(b.to_vec())); // Extract PAX-overridden uname/gname and xattrs from PAX extensions. // tar-rs does not expose PAX uname/gname through entry-level methods,