From 68d010975cfecc43b0d474449dc41c7614237c48 Mon Sep 17 00:00:00 2001 From: Timo Naroska Date: Fri, 22 May 2026 09:43:18 -0700 Subject: [PATCH] fix: apply binary/text test gate at rule entry only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The skip for is_only_binary/is_only_text in Match::matches ran on every recursion, so child tests of an already-matched parent were dropped on text streams. This silently stripped any rule whose message lived on a scalar/float/pstring/string16/indirect child — most visibly the RTF rule (magic-db/src/magdir/rtf), which carries its message on a level-2 ubyte qualifier and was reported as plain "ASCII text". Hoist the gate into EntryNode::matches and run it only when self.root, mirroring upstream libmagic's softmagic.c::match() semantics: the binary/text decision is taken once at the outer match loop, not re-applied to children. Top-level perf filtering for the >300 rule files is preserved. Add two regression tests covering (a) the RTF-shaped rule emitting its message on a text stream and (b) the upstream DROID-skip qualifier still rejecting fmt-355-signature-id-522.rtf. Co-Authored-By: Claude Opus 4.7 --- pure-magic/src/lib.rs | 66 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/pure-magic/src/lib.rs b/pure-magic/src/lib.rs index 60a52fe..40bc312 100644 --- a/pure-magic/src/lib.rs +++ b/pure-magic/src/lib.rs @@ -2197,16 +2197,6 @@ impl Match { )); } - if self.test.is_only_binary() && stream_kind.is_text() { - trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}",); - return Ok((false, None)); - } - - if self.test.is_only_text() && !stream_kind.is_text() { - trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}",); - return Ok((false, None)); - } - let Ok(Some(mut offset)) = self .offset_from_start(haystack, rule_base_offset, last_level_offset) .inspect_err(|e| debug!("source={source} line={line} failed at computing offset: {e}")) @@ -2588,6 +2578,26 @@ impl EntryNode { ) -> Result { let mut nmatch = 0u64; + // Mirror libmagic's softmagic.c::match(): the binary/text gate fires + // only at the top of a rule. Once a parent matches, its sub-tests run + // regardless of stream classification — otherwise nested scalar tests + // (e.g. the `>5 ubyte` qualifiers inside the RTF rule) are dropped on + // text inputs and the rule's message is never emitted. + if self.root { + let source = opt_source.unwrap_or("unknown"); + let line = self.entry.line; + + if self.entry.test.is_only_binary() && stream_kind.is_text() { + trace!("skip binary test source={source} line={line} stream_kind={stream_kind:?}"); + return Ok(0); + } + + if self.entry.test.is_only_text() && !stream_kind.is_text() { + trace!("skip text test source={source} line={line} stream_kind={stream_kind:?}"); + return Ok(0); + } + } + let (ok, opt_match_res) = self.entry.matches( opt_source, magic, @@ -4918,4 +4928,40 @@ HelloWorld db.load_bulk(rules.into_iter()); assert!(matches!(db.verify(), Err(Error::Verify(_, _, _)))); } + + // Regression: nested scalar tests under a top-level `string` match used to + // be dropped on text inputs because the binary/text gate ran inside every + // recursion. The shape mirrors magdir/rtf:11–18 — the message lives on + // the level-2 ubyte child, so dropping the children silently strips the + // whole rule. Upstream libmagic's softmagic.c::match() applies the gate + // only at the outer match loop. + #[test] + fn test_string_parent_with_scalar_children_on_text_stream() { + assert_magic_match_text!( + r" +0 string {\\rtf +>5 ubyte !0xAB +>>5 ubyte !0x5C Rich Text Format data +!:mime text/rtf +!:ext rtf + ", + b"{\\rtf1\\ansi\\ansicpg1252\nHello world}", + "Rich Text Format data" + ); + } + + // The level-1 `>5 ubyte !0xAB` qualifier exists upstream specifically to + // skip DROID fmt-355-signature-id-522.rtf. With \xAB at offset 5 the rule + // must NOT emit a message even now that children run on text streams. + #[test] + fn test_rtf_droid_skip_still_rejects() { + assert_magic_not_match_text!( + r" +0 string {\\rtf +>5 ubyte !0xAB +>>5 ubyte !0x5C Rich Text Format data + ", + b"{\\rtf\xab......\n" + ); + } }