diff --git a/CHANGELOG.md b/CHANGELOG.md index e3a6fc9..799290e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to Crossfire will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.2.8] - 2026-04-21 + +### Fixed + +- **Non-greedy quantifiers no longer get sampled like greedy ones.** `sre_parse` tags `{m,n}?`, `*?`, and `+?` as `min_repeat` and the greedy forms as `max_repeat`; upstream rstr (3.2.x) dispatches both through the same handler and draws the repeat count uniformly from `[m, n]` — throwing away the non-greedy semantics. For patterns with wide non-greedy holes like `(?s:.){0,200}?`, rstr generated ~100 random chars in the gap (with `.` sampled from `string.printable`, including `\v`/`\x0c`/`\n`). The result (a) bore no resemblance to what the regex would actually match against real text — the re engine fills non-greedy regions with the minimum the surrounding anchors allow — and (b) blew past `max_string_length=256`, so ~99% of rstr calls got filtered out, the few survivors all shared one degenerate shape, and stage-2 mutational padding then fanned that single base into a whole corpus of near-duplicates. `crossfire.generator` now patches `rstr.xeger.Xeger._handle_state` at module import: when the opcode is `min_repeat`, we emit exactly `start_range` repetitions — the semantically correct minimum, matching how the re engine behaves. `max_repeat` still goes through `_handle_repeat` (with the `{N>100}` cap fix from 0.2.7). Measured effect on the real-world `kubernetes_secret_yaml` gitleaks rule (the 0.2.7 degenerate-sample report): from 27 samples / ~10 unique middles to 30 samples / 29 unique middles at `samples_per_rule=30`, `max_string_length=256`. 
This resolves the last 0.2.7 known limitation — all 6 rules in the lumen-argus community.json regression set now produce real, diverse coverage. Regression coverage: `tests/test_generator.py::TestNonGreedyRepeat`. + ## [0.2.7] - 2026-04-21 ### Fixed diff --git a/crossfire/generator.py b/crossfire/generator.py index 429c4b9..130910e 100644 --- a/crossfire/generator.py +++ b/crossfire/generator.py @@ -52,9 +52,43 @@ def _patched_handle_repeat(self: Any, start_range: int, end_range: int, value: s return "".join(result) +_RSTR_ORIGINAL_HANDLE_STATE = _RstrXeger._handle_state + + +def _patched_handle_state(self: Any, state: Any) -> Any: + """Respect non-greedy quantifiers when generating samples. + + `sre_parse` tags `{m,n}?`, `*?`, `+?` as `min_repeat` and the greedy + forms as `max_repeat`. Upstream rstr routes both to the same handler + and draws the repeat count uniformly from `[m, n]` in either case — + throwing away the non-greedy semantics. + + For pattern-matching, that's not how non-greedy works. `(?s:.){0,200}?` + is the author saying "match as few dots as possible"; the re engine + would fill that region with the minimum the surrounding anchors allow. + A generator that picks ~100 random dot-any characters produces strings + that (a) look nothing like what this rule actually matches in real text + and (b) blow past `max_string_length`, so ~99% of rstr calls get + filtered out. The few survivors all share the same degenerate shape, + and stage-2 padding then fans one base sample into a whole corpus of + near-duplicates — the `kubernetes_secret_yaml` degeneracy reported + against 0.2.7. + + Fix: for `min_repeat`, emit exactly `start_range` repetitions — the + semantically correct minimum. Greedy `max_repeat` still goes through + `_handle_repeat` (with the `{N>100}` cap fix above). 
+ """ + opcode, value = state + if opcode.name.lower() == "min_repeat": + start_range, _end_range, sub = value + return "".join("".join(self._handle_state(i) for i in sub) for _ in range(start_range)) + return _RSTR_ORIGINAL_HANDLE_STATE(self, state) + + # setattr (not direct assignment) avoids mypy's method-assign error # without an ignore that warn_unused_ignores flips on newer Python versions. setattr(_RstrXeger, "_handle_repeat", _patched_handle_repeat) # noqa: B010 +setattr(_RstrXeger, "_handle_state", _patched_handle_state) # noqa: B010 # We default to "spawn" rather than "fork" because forking from a multi-threaded # parent process is unsafe: child processes inherit memory but only the calling diff --git a/pyproject.toml b/pyproject.toml index c99bc9b..3a10727 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ include = ["crossfire*"] [project] name = "crossfire-rules" -version = "0.2.7" +version = "0.2.8" description = "Regex rule overlap analyzer for DLP, secret scanning, SAST, and IDS tools" readme = "README.md" license = "MIT" diff --git a/tests/test_generator.py b/tests/test_generator.py index f158fb0..50e671a 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -335,6 +335,96 @@ def test_narrow_variable_range_unchanged(self): assert 2 <= len(s) <= 5 +class TestNonGreedyRepeat: + """Upstream rstr routes `min_repeat` (non-greedy `{m,n}?`, `*?`, `+?`) + through the same handler as `max_repeat`, sampling uniformly over + `[m, n]`. That breaks wide non-greedy holes like `(?s:.){0,200}?`: + the generator emits ~100 random chars there, the result blows past + `max_string_length`, gets filtered, and the few survivors all share + one degenerate shape — stage-2 padding then inflates that single base + into a pseudo-corpus. Crossfire's patch makes `min_repeat` emit + exactly `start_range` repetitions (the semantically correct minimum), + which is how the re engine would actually fill that region against + real text. 
These tests lock that behavior in. + """ + + def test_non_greedy_zero_min_emits_empty(self): + """`(?s:.){0,100}?` should produce zero dot-any chars, not ~50.""" + import rstr + + import crossfire.generator # noqa: F401 + + # A pattern where the only variability is the non-greedy hole. + # With the fix, the hole is always empty → output is exactly "ab". + for _ in range(20): + s = rstr.xeger(r"a(?s:.){0,100}?b") + assert s == "ab" + + def test_non_greedy_nonzero_min_emits_min(self): + """`X{5,50}?` should emit exactly 5 copies of X — the explicit min.""" + import rstr + + import crossfire.generator # noqa: F401 + + for _ in range(20): + s = rstr.xeger(r"([a-f]{5,50}?)") + assert len(s) == 5 + assert all(c in "abcdef" for c in s) + + def test_greedy_still_varies(self): + """Greedy `{m,n}` (max_repeat) must keep varying — only non-greedy + is clamped to the minimum.""" + import rstr + + import crossfire.generator # noqa: F401 + + lengths = {len(rstr.xeger(r"[a-z]{2,20}")) for _ in range(40)} + assert len(lengths) >= 3 + + def test_kubernetes_secret_yaml_diversity(self): + """Regression for 0.2.7 degenerate-sample report: the gitleaks + `kubernetes_secret_yaml` pattern has two wide `(?s:.){0,N}?` holes. + Before the non-greedy fix, ~99% of raw rstr outputs overshot + `max_string_length=256` and the few passing samples shared the + same degenerate middle, which stage-2 padding then multiplied. + After the fix, each sample's middle must be distinct. + """ + # The exact real-world pattern from community.json (gitleaks). + pattern = ( + r"(?i)(?:\bkind:[ \t]*[\"']?\bsecret\b[\"']?(?s:.){0,200}?" + r"\bdata:(?s:.){0,100}?\s+([\w.-]+:(?:[ \t]*(?:\||>[-+]?)\s+)?" + r"[ \t]*(?:[\"']?[a-z0-9+/]{10,}={0,3}[\"']?" + r"|\{\{[ \t\w\"|$:=,.-]+}}|\"\"|''))|\bdata:(?s:.){0,100}?" + r"\s+([\w.-]+:(?:[ \t]*(?:\||>[-+]?)\s+)?[ \t]*" + r"(?:[\"']?[a-z0-9+/]{10,}={0,3}[\"']?" + r"|\{\{[ \t\w\"|$:=,.-]+}}|\"\"|''))(?s:.){0,200}?" 
+ r"\bkind:[ \t]*[\"']?\bsecret\b[\"']?)" + ) + rule = _make_rule("kubernetes_secret_yaml", pattern) + gen = CorpusGenerator( + samples_per_rule=30, min_valid_samples=15, negative_samples=0, seed=42 + ) + entries = gen.generate([rule]) + positives = [e.text for e in entries if not e.is_negative] + assert len(positives) >= 15 + + # Middle-of-string uniqueness: the test from the team's 0.2.7 report. + # Prefixes/suffixes are stage-2 padding noise; the middle reveals + # whether the base matches are actually distinct. + def middle(s: str, k: int = 50) -> str: + if len(s) <= k: + return s + mid = len(s) // 2 + return s[mid - k // 2 : mid + k // 2] + + unique_middles = {middle(s) for s in positives} + # Pre-fix: 10/27. Post-fix: should be close to 1:1 with sample count. + assert len(unique_middles) >= len(positives) * 0.7, ( + f"only {len(unique_middles)} unique middles across {len(positives)} samples " + f"— non-greedy fix regressed, samples are degenerate again" + ) + + class TestIntermittentRstrFailures: """Some patterns make rstr throw on a fraction of calls — the generator must not `break` on the first exception, or we lose the useful output