Merged
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to Crossfire will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.2.8] - 2026-04-21

### Fixed

- **Non-greedy quantifiers no longer get sampled like greedy ones.** `sre_parse` tags `{m,n}?`, `*?`, and `+?` as `min_repeat` and the greedy forms as `max_repeat`; upstream rstr (3.2.x) dispatches both through the same handler and draws the repeat count uniformly from `[m, n]`, discarding the non-greedy semantics. For patterns with wide non-greedy holes like `(?s:.){0,200}?`, rstr generated ~100 random characters in the gap (with `.` sampled from `string.printable`, including `\v`/`\x0c`/`\n`). The result (a) bore no resemblance to what the regex would match against real text, since the re engine fills non-greedy regions with the minimum the surrounding anchors allow, and (b) blew past `max_string_length=256`: ~99% of rstr calls got filtered out, the few survivors all shared one degenerate shape, and stage-2 mutational padding then fanned that single base into a whole corpus of near-duplicates.

  `crossfire.generator` now patches `rstr.xeger.Xeger._handle_state` at module import: when the opcode is `min_repeat`, it emits exactly `start_range` repetitions, the semantically correct minimum and the way the re engine behaves. `max_repeat` still goes through `_handle_repeat` (with the `{N>100}` cap fix from 0.2.7). Measured effect on the real-world `kubernetes_secret_yaml` gitleaks rule (the subject of the 0.2.7 degenerate-sample report), at `samples_per_rule=30` and `max_string_length=256`: 27 samples with ~10 shared middles before, 30 samples with 29 unique middles after. This resolves the last known 0.2.7 limitation; all 6 rules in the lumen-argus community.json regression set now produce real, diverse coverage. Regression coverage: `tests/test_generator.py::TestNonGreedyRepeat`.
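The `min_repeat`/`max_repeat` tagging described above can be checked directly against the stdlib parser. A quick stdlib-only probe (the module is `re._parser` on Python 3.11+, importable as `sre_parse` before that):

```python
# Peek at how the stdlib regex parser tags greedy vs non-greedy repeats.
try:
    from re import _parser as sre_parse  # Python 3.11+
except ImportError:
    import sre_parse  # Python <= 3.10

greedy = sre_parse.parse(r"a{2,5}")[0]
lazy = sre_parse.parse(r"a{2,5}?")[0]

# Each parsed node is an (opcode, value) pair; for a repeat, the
# value is (min, max, subpattern).
print(greedy[0].name)  # MAX_REPEAT
print(lazy[0].name)    # MIN_REPEAT
```

rstr receives exactly these nodes, so distinguishing the two opcode names is all the patch needs to do.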

## [0.2.7] - 2026-04-21

### Fixed
34 changes: 34 additions & 0 deletions crossfire/generator.py
@@ -52,9 +52,43 @@ def _patched_handle_repeat(self: Any, start_range: int, end_range: int, value: s
return "".join(result)


_RSTR_ORIGINAL_HANDLE_STATE = _RstrXeger._handle_state


def _patched_handle_state(self: Any, state: Any) -> Any:
"""Respect non-greedy quantifiers when generating samples.

`sre_parse` tags `{m,n}?`, `*?`, `+?` as `min_repeat` and the greedy
forms as `max_repeat`. Upstream rstr routes both to the same handler
and draws the repeat count uniformly from `[m, n]` in either case —
throwing away the non-greedy semantics.

For pattern-matching, that's not how non-greedy works. `(?s:.){0,200}?`
is the author saying "match as few dots as possible"; the re engine
would fill that region with the minimum the surrounding anchors allow.
A generator that picks ~100 random dot-any characters produces strings
that (a) look nothing like what this rule actually matches in real text
and (b) blow past `max_string_length`, so ~99% of rstr calls get
filtered out. The few survivors all share the same degenerate shape,
and stage-2 padding then fans one base sample into a whole corpus of
near-duplicates — the `kubernetes_secret_yaml` degeneracy reported
against 0.2.7.

Fix: for `min_repeat`, emit exactly `start_range` repetitions — the
semantically correct minimum. Greedy `max_repeat` still goes through
`_handle_repeat` (with the `{N>100}` cap fix above).
"""
opcode, value = state
if opcode.name.lower() == "min_repeat":
start_range, _end_range, sub = value
return "".join("".join(self._handle_state(i) for i in sub) for _ in range(start_range))
return _RSTR_ORIGINAL_HANDLE_STATE(self, state)


# setattr (not direct assignment) avoids mypy's method-assign error
# without an ignore that warn_unused_ignores flips on newer Python versions.
setattr(_RstrXeger, "_handle_repeat", _patched_handle_repeat) # noqa: B010
setattr(_RstrXeger, "_handle_state", _patched_handle_state) # noqa: B010

# We default to "spawn" rather than "fork" because forking from a multi-threaded
# parent process is unsafe: child processes inherit memory but only the calling
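The dispatch this patch installs can be sketched without rstr at all. A stdlib-only toy (the `gen` helper below is hypothetical, covering just three opcodes, and is not Crossfire's actual generator): `min_repeat` collapses to its declared minimum, while `max_repeat` keeps sampling, with the same cap-at-100 guard the 0.2.7 fix applies.

```python
import random

try:
    from re import _parser as sre_parse  # Python 3.11+
except ImportError:
    import sre_parse  # Python <= 3.10


def gen(tree) -> str:
    """Tiny xeger-style generator for LITERAL / MIN_REPEAT / MAX_REPEAT."""
    out = []
    for opcode, value in tree:
        name = opcode.name.lower()
        if name == "literal":
            out.append(chr(value))
        elif name == "min_repeat":
            # Non-greedy: emit exactly the declared minimum.
            lo, _hi, sub = value
            out.append(gen(sub) * lo)
        elif name == "max_repeat":
            # Greedy: sample uniformly, capped (the parsed max for `*`/`+`
            # is MAXREPEAT, which is effectively unbounded).
            lo, hi, sub = value
            out.append(gen(sub) * random.randint(lo, min(hi, 100)))
    return "".join(out)


print(gen(sre_parse.parse(r"ab{2,5}?c")))  # always "abbc"
```

The lazy hole clamps to its minimum of 2 on every call, while a greedy `x{2,5}` keeps varying between 2 and 5 characters, which is the asymmetry the real patch restores.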
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -7,7 +7,7 @@ include = ["crossfire*"]

[project]
name = "crossfire-rules"
version = "0.2.7"
version = "0.2.8"
description = "Regex rule overlap analyzer for DLP, secret scanning, SAST, and IDS tools"
readme = "README.md"
license = "MIT"
90 changes: 90 additions & 0 deletions tests/test_generator.py
@@ -335,6 +335,96 @@ def test_narrow_variable_range_unchanged(self):
assert 2 <= len(s) <= 5


class TestNonGreedyRepeat:
"""Upstream rstr routes `min_repeat` (non-greedy `{m,n}?`, `*?`, `+?`)
through the same handler as `max_repeat`, sampling uniformly over
`[m, n]`. That breaks wide non-greedy holes like `(?s:.){0,200}?`:
the generator emits ~100 random chars there, the result blows past
`max_string_length`, gets filtered, and the few survivors all share
one degenerate shape — stage-2 padding then inflates that single base
into a pseudo-corpus. Crossfire's patch makes `min_repeat` emit
exactly `start_range` repetitions (the semantically correct minimum),
which is how the re engine would actually fill that region against
real text. These tests lock that behavior in.
"""

def test_non_greedy_zero_min_emits_empty(self):
"""`(?s:.){0,100}?` should produce zero dot-any chars, not ~50."""
import rstr

import crossfire.generator # noqa: F401

# A pattern where the only variability is the non-greedy hole.
# With the fix, the hole is always empty → output is exactly "ab".
for _ in range(20):
s = rstr.xeger(r"a(?s:.){0,100}?b")
assert s == "ab"

def test_non_greedy_nonzero_min_emits_min(self):
"""`X{5,50}?` should emit exactly 5 copies of X — the explicit min."""
import rstr

import crossfire.generator # noqa: F401

for _ in range(20):
s = rstr.xeger(r"([a-f]{5,50}?)")
assert len(s) == 5
assert all(c in "abcdef" for c in s)

def test_greedy_still_varies(self):
"""Greedy `{m,n}` (max_repeat) must keep varying — only non-greedy
is clamped to the minimum."""
import rstr

import crossfire.generator # noqa: F401

lengths = {len(rstr.xeger(r"[a-z]{2,20}")) for _ in range(40)}
assert len(lengths) >= 3

def test_kubernetes_secret_yaml_diversity(self):
"""Regression for 0.2.7 degenerate-sample report: the gitleaks
`kubernetes_secret_yaml` pattern has two wide `(?s:.){0,N}?` holes.
Before the non-greedy fix, ~99% of raw rstr outputs overshot
`max_string_length=256` and the few passing samples shared the
same degenerate middle, which stage-2 padding then multiplied.
After the fix, each sample's middle must be distinct.
"""
# The exact real-world pattern from community.json (gitleaks).
pattern = (
r"(?i)(?:\bkind:[ \t]*[\"']?\bsecret\b[\"']?(?s:.){0,200}?"
r"\bdata:(?s:.){0,100}?\s+([\w.-]+:(?:[ \t]*(?:\||>[-+]?)\s+)?"
r"[ \t]*(?:[\"']?[a-z0-9+/]{10,}={0,3}[\"']?"
r"|\{\{[ \t\w\"|$:=,.-]+}}|\"\"|''))|\bdata:(?s:.){0,100}?"
r"\s+([\w.-]+:(?:[ \t]*(?:\||>[-+]?)\s+)?[ \t]*"
r"(?:[\"']?[a-z0-9+/]{10,}={0,3}[\"']?"
r"|\{\{[ \t\w\"|$:=,.-]+}}|\"\"|''))(?s:.){0,200}?"
r"\bkind:[ \t]*[\"']?\bsecret\b[\"']?)"
)
rule = _make_rule("kubernetes_secret_yaml", pattern)
gen = CorpusGenerator(
samples_per_rule=30, min_valid_samples=15, negative_samples=0, seed=42
)
entries = gen.generate([rule])
positives = [e.text for e in entries if not e.is_negative]
assert len(positives) >= 15

# Middle-of-string uniqueness: the test from the team's 0.2.7 report.
# Prefixes/suffixes are stage-2 padding noise; the middle reveals
# whether the base matches are actually distinct.
def middle(s: str, k: int = 50) -> str:
if len(s) <= k:
return s
mid = len(s) // 2
return s[mid - k // 2 : mid + k // 2]

unique_middles = {middle(s) for s in positives}
# Pre-fix: 10/27. Post-fix: should be close to 1:1 with sample count.
assert len(unique_middles) >= len(positives) * 0.7, (
f"only {len(unique_middles)} unique middles across {len(positives)} samples "
f"— non-greedy fix regressed, samples are degenerate again"
)


class TestIntermittentRstrFailures:
"""Some patterns make rstr throw on a fraction of calls — the generator
must not `break` on the first exception, or we lose the useful output
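The keep-going behavior that class guards can be sketched as follows (a hypothetical `collect` helper, not the generator's real loop; the point is `continue` rather than `break` on a failed call):

```python
def collect(sample, want: int, max_attempts: int = 200) -> list:
    """Gather up to `want` samples, skipping intermittent failures."""
    out = []
    for _ in range(max_attempts):
        if len(out) >= want:
            break
        try:
            out.append(sample())
        except Exception:
            continue  # a flaky call loses one sample, not the whole run
    return out
```

Breaking on the first exception would instead return only whatever happened to be collected before the first flaky call, which is the failure mode these tests pin down.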