From 6afd62b844de21f95378171d43a39453e8b3e915 Mon Sep 17 00:00:00 2001
From: Thomas Connally <tconnally@atlassian.com>
Date: Wed, 3 Jun 2026 16:39:34 +0000
Subject: [PATCH] fix(redaction): scope long_hex_secret to credential context
 (#136)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pre-1.0.6 default rule `\b[a-fA-F0-9]{40,}\b` silently destroyed git
commit hashes (40 hex chars), SHA-256 sums (64 hex chars), Docker digests,
and Atlassian content hashes in any rendered output that crossed the trust
boundary. `@query "git log --oneline"` produced `[REDACTED:long_hex_secret]`
for every commit hash, with no recovery path.

The rule now requires an explicit credential anchor before matching:
  (?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)
  \s*[:=]\s*["']?([a-fA-F0-9]{40,})["']?

Real secrets in credential context are still caught; bare hashes pass through.

Internals:
- New `_anchor_group` field on rule dicts identifies which capture group
  holds the secret payload. _sub() in redact_text() replaces only that span
  while preserving surrounding context verbatim.
- Legacy prefix-preserve behavior (group(1) is a prefix) is retained for
  bearer_header and other existing rules — controlled by absence of
  _anchor_group.

Tests (tests/test_redaction.py):
- test_bare_git_sha1_is_not_redacted_by_defaults
- test_bare_sha256_checksum_is_not_redacted_by_defaults
- test_credential_anchored_hex_IS_redacted
- test_credential_anchored_hex_preserves_surrounding_context
- test_bearer_header_prefix_still_preserved
- test_at_query_git_log_output_survives_redaction

All 27 redaction tests pass. test_edge_cases.py parity vs main confirmed
(0 net new failures).

Closes #136
Refs milestone v1.0.6
---
 CHANGELOG.md             | 42 +++++++++++++++++++++++++
 perseus.py               | 58 ++++++++++++++++++++++++++++------
 src/perseus/redaction.py | 56 ++++++++++++++++++++++++++++-----
 tests/test_redaction.py  | 67 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 205 insertions(+), 18 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1056a60..f0ee216 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,48 @@ Each entry maps a release to the task IDs that shipped in it. The
 single-file `perseus.py` runtime is the only required artifact; everything
 else (installer, docs) is generated by `scripts/release.sh`.
 
+## [1.0.6] — UNRELEASED
+
+Critical security + correctness hotfix bundle from a staff-engineering code
+review on 2026-06-03. Twelve fixes; no breaking config changes; auto-migration
+where applicable. See GitHub milestone
+[v1.0.6](https://github.com/tcconnally/perseus/milestone/1) for full tracking.
+
+### 🔒 Security
+
+- **#136** — `long_hex_secret` default redaction rule rewritten. The pre-1.0.6
+  rule (`\b[a-fA-F0-9]{40,}\b`) silently destroyed git commit hashes (40 hex),
+  SHA-256 checksums (64 hex), Docker digests, and Atlassian content hashes in
+  `@query`, `@waypoint`, and `@perseus` output. The rule now requires an
+  explicit credential anchor (`secret=`, `token:`, `api_key=`, `Authorization=`,
+  etc.) before redacting hex strings. Real secrets in credential context are
+  still caught; bare hashes survive verbatim. New regression tests in
+  `tests/test_redaction.py`.
+- Internal: redaction rules now support an optional `_anchor_group` field that
+  identifies which capture group holds the secret payload (everything outside
+  the group is preserved). Used by `long_hex_secret`; available for future
+  context-aware rules.
+
+### 🐛 Bug Fixes (pending — tracked in milestone)
+- #128 — Mnēmē MD5→SHA-256 narrative migration
+- #129 — `balanced` profile silently overrides `allow_query_shell`
+- #130 — `--llm none` crashes
+- #131 — `memory compact` hangs with slow LLM
+- #135 — `@memory focus=` rigid heading match
+- #137 — `@query` audit log leaks secrets
+- #138 — `@query timeout=N` leaks into command
+- #139 — MCP `_call_tool` subprocess leak
+- #140 — `_save_narrative` lacks fsync
+- #141 — User redaction patterns with capture groups
+- #142 — Add Atlassian token to defaults
+
+### 📦 Migration Notes
+- No config breaking changes. If you had relied on `long_hex_secret` redacting
+  bare hex output (unlikely), add an explicit user rule under
+  `redaction.patterns` matching your shape.
+
+---
+
 ## [1.0.5] — 2026-05-26
 
 **Bastra-Recall — Persistent Memory Backend (superseded by Mnēmē v2 in 1.0.6):**
diff --git a/perseus.py b/perseus.py
index 0527f05..4e570f4 100644
--- a/perseus.py
+++ b/perseus.py
@@ -1305,8 +1305,19 @@ def _strip_macro_defs(lines: list[str]) -> "iter":
     {"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"},
     # PEM private key block (covers RSA, EC, OPENSSH, generic)
     {"name": "private_key_block", "pattern": r"-----BEGIN (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----"},
-    # Hex-encoded high-entropy strings of 40+ chars used as secrets/api hashes
-    {"name": "long_hex_secret", "pattern": r"\b[a-fA-F0-9]{40,}\b"},
+    # Hex-encoded high-entropy strings of 40+ chars in an obvious credential
+    # context (assigned to a `secret=`, `token=`, `key=`, `password=`,
+    # `api_key=` slot, or quoted after a colon in JSON/YAML).
+    #
+    # IMPORTANT: a bare `\b[a-fA-F0-9]{40,}\b` rule (pre-1.0.6 default) was a
+    # landmine — it matched git commit SHAs (40 hex chars), SHA-256 sums (64
+    # hex chars), Docker digests, and Atlassian content hashes, silently
+    # destroying forensically important data in `@query "git log"` output
+    # and similar. This rule now requires an explicit credential anchor.
+    # See: https://github.com/tcconnally/perseus/issues/136
+    {"name": "long_hex_secret",
+     "pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?",
+     "_anchor_group": 1},
     # HuggingFace: hf_... (read/write tokens)
     {"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"},
     # Google Cloud API key: AIza...
@@ -1370,7 +1381,19 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]:
         replacement = rule.get("replacement")
         if not replacement:
             replacement = f"[REDACTED:{name}]"
-        compiled.append({"name": name, "regex": regex, "replacement": str(replacement)})
+        # `_anchor_group` (rule-internal, default None): index of the capture
+        # group holding the SECRET payload (everything outside that group is
+        # context that must be preserved verbatim). Used by the credential-
+        # anchored `long_hex_secret` rule. When unset, fall back to legacy
+        # behavior: group(1) (if present) is treated as a leading prefix to
+        # preserve and the rest of the match is replaced.
+        anchor_group = rule.get("_anchor_group")
+        compiled.append({
+            "name": name,
+            "regex": regex,
+            "replacement": str(replacement),
+            "anchor_group": anchor_group,
+        })
     return compiled
 
 
@@ -1404,12 +1427,29 @@ def redact_text(text: str, cfg: dict) -> tuple[str, dict]:
         name = rule["name"]
         regex = rule["regex"]
         # subn returns (new, n); use a callable replacement so groupref-style
-        # rules (e.g. the bearer header rule that preserves the prefix via
-        # group 1) work consistently.
-        def _sub(match, _repl=rule["replacement"]):
+        # rules work consistently.
+        #
+        # Three modes:
+        #   1. `anchor_group=N`: the captured group at index N is the SECRET
+        #      payload. Replace only that span; preserve everything else
+        #      verbatim. Used by the credential-anchored `long_hex_secret` rule.
+        #   2. `match.lastindex` set (no anchor_group): legacy behavior — the
+        #      first capture group is a prefix to preserve, everything after
+        #      the prefix is replaced. Used by `bearer_header`.
+        #   3. No capture groups: replace the whole match.
+        def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")):
+            if _ag is not None:
+                try:
+                    span_start, span_end = match.span(_ag)
+                except (IndexError, re.error):
+                    return _repl
+                if span_start < 0:
+                    return _repl
+                full = match.group(0)
+                rel_start = span_start - match.start()
+                rel_end = span_end - match.start()
+                return full[:rel_start] + _repl + full[rel_end:]
             if match.lastindex:
-                # Preserve any leading captured group verbatim (e.g. the
-                # `Authorization: Bearer ` prefix); everything else is wiped.
                 return match.group(1) + _repl
             return _repl
         out, n = regex.subn(_sub, out)
@@ -2451,7 +2491,6 @@ def resolve_include(args_str: str, workspace: Path | None = None, cfg: dict | No
         return f"> ⚠ @include: could not read `{file_path_str}`: {e}"
 
     # ── File size limit check (byte-counted, not character-counted) ──
-    max_bytes = render_cfg.get("max_include_bytes")
     if max_bytes is not None and len(data) > max_bytes:
         raw = data[:max_bytes].decode(errors="replace").rstrip()
         actual_size = len(data)
@@ -2577,7 +2616,6 @@ def fallback_result() -> str:
         return f"> ⚠ @read: could not read `{file_path_str}`: {e}"
 
     # ── File size limit check (byte-counted, not character-counted) ──
-    max_bytes = render_cfg.get("max_read_bytes")
     if max_bytes is not None and len(data) > max_bytes:
         content = data[:max_bytes].decode(errors="replace")
         trunc_note = (
diff --git a/src/perseus/redaction.py b/src/perseus/redaction.py
index fed0a1d..458ac2a 100644
--- a/src/perseus/redaction.py
+++ b/src/perseus/redaction.py
@@ -44,8 +44,19 @@
     {"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"},
     # PEM private key block (covers RSA, EC, OPENSSH, generic)
     {"name": "private_key_block", "pattern": r"-----BEGIN (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----"},
-    # Hex-encoded high-entropy strings of 40+ chars used as secrets/api hashes
-    {"name": "long_hex_secret", "pattern": r"\b[a-fA-F0-9]{40,}\b"},
+    # Hex-encoded high-entropy strings of 40+ chars in an obvious credential
+    # context (assigned to a `secret=`, `token=`, `key=`, `password=`,
+    # `api_key=` slot, or quoted after a colon in JSON/YAML).
+    #
+    # IMPORTANT: a bare `\b[a-fA-F0-9]{40,}\b` rule (pre-1.0.6 default) was a
+    # landmine — it matched git commit SHAs (40 hex chars), SHA-256 sums (64
+    # hex chars), Docker digests, and Atlassian content hashes, silently
+    # destroying forensically important data in `@query "git log"` output
+    # and similar. This rule now requires an explicit credential anchor.
+    # See: https://github.com/tcconnally/perseus/issues/136
+    {"name": "long_hex_secret",
+     "pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?",
+     "_anchor_group": 1},
     # HuggingFace: hf_... (read/write tokens)
     {"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"},
     # Google Cloud API key: AIza...
@@ -109,7 +120,19 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]:
         replacement = rule.get("replacement")
         if not replacement:
             replacement = f"[REDACTED:{name}]"
-        compiled.append({"name": name, "regex": regex, "replacement": str(replacement)})
+        # `_anchor_group` (rule-internal, default None): index of the capture
+        # group holding the SECRET payload (everything outside that group is
+        # context that must be preserved verbatim). Used by the credential-
+        # anchored `long_hex_secret` rule. When unset, fall back to legacy
+        # behavior: group(1) (if present) is treated as a leading prefix to
+        # preserve and the rest of the match is replaced.
+        anchor_group = rule.get("_anchor_group")
+        compiled.append({
+            "name": name,
+            "regex": regex,
+            "replacement": str(replacement),
+            "anchor_group": anchor_group,
+        })
     return compiled
 
 
@@ -143,12 +166,29 @@ def redact_text(text: str, cfg: dict) -> tuple[str, dict]:
         name = rule["name"]
         regex = rule["regex"]
         # subn returns (new, n); use a callable replacement so groupref-style
-        # rules (e.g. the bearer header rule that preserves the prefix via
-        # group 1) work consistently.
-        def _sub(match, _repl=rule["replacement"]):
+        # rules work consistently.
+        #
+        # Three modes:
+        #   1. `anchor_group=N`: the captured group at index N is the SECRET
+        #      payload. Replace only that span; preserve everything else
+        #      verbatim. Used by the credential-anchored `long_hex_secret` rule.
+        #   2. `match.lastindex` set (no anchor_group): legacy behavior — the
+        #      first capture group is a prefix to preserve, everything after
+        #      the prefix is replaced. Used by `bearer_header`.
+        #   3. No capture groups: replace the whole match.
+        def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")):
+            if _ag is not None:
+                try:
+                    span_start, span_end = match.span(_ag)
+                except (IndexError, re.error):
+                    return _repl
+                if span_start < 0:
+                    return _repl
+                full = match.group(0)
+                rel_start = span_start - match.start()
+                rel_end = span_end - match.start()
+                return full[:rel_start] + _repl + full[rel_end:]
             if match.lastindex:
-                # Preserve any leading captured group verbatim (e.g. the
-                # `Authorization: Bearer ` prefix); everything else is wiped.
                 return match.group(1) + _repl
             return _repl
         out, n = regex.subn(_sub, out)
diff --git a/tests/test_redaction.py b/tests/test_redaction.py
index 80bfa51..ba305b5 100644
--- a/tests/test_redaction.py
+++ b/tests/test_redaction.py
@@ -315,3 +315,70 @@ def test_strict_profile_does_not_disable_redaction(monkeypatch, tmp_path):
     }))
     cfg = perseus.load_config()
     assert cfg["redaction"]["enabled"] is True
+
+
+# ── regression: #136 long_hex_secret must NOT eat git hashes ─────────────────
+
+
+def test_bare_git_sha1_is_not_redacted_by_defaults():
+    """Regression for #136 — bare 40-char git SHAs must survive default rules."""
+    git_log_line = "86ca950b3f1a2c4d5e6f7a8b9c0d1e2f3a4b5c6d  fix(resolve_read)"
+    out, report = perseus.redact_text(git_log_line, {})
+    assert "86ca950b3f1a2c4d5e6f7a8b9c0d1e2f3a4b5c6d" in out
+    assert report["counts"].get("long_hex_secret", 0) == 0
+
+
+def test_bare_sha256_checksum_is_not_redacted_by_defaults():
+    """Regression for #136 — 64-char SHA-256 sums must survive."""
+    sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+    text = f"checksum: {sha256}  perseus.py"
+    out, report = perseus.redact_text(text, {})
+    assert sha256 in out
+    assert report["counts"].get("long_hex_secret", 0) == 0
+
+
+def test_credential_anchored_hex_IS_redacted():
+    """Regression for #136 — the new rule MUST still catch real secrets."""
+    cases = [
+        'api_key = "abcdef0123456789abcdef0123456789abcdef01"',
+        "secret=abcdef0123456789abcdef0123456789abcdef01",
+        'token: "abcdef0123456789abcdef0123456789abcdef01"',
+        "password = abcdef0123456789abcdef0123456789abcdef01",
+        "Authorization=abcdef0123456789abcdef0123456789abcdef01",
+    ]
+    for text in cases:
+        out, report = perseus.redact_text(text, {})
+        assert "abcdef01" not in out, (
+            f"Hex secret in credential context not redacted: {text!r} → {out!r}"
+        )
+        assert report["counts"].get("long_hex_secret", 0) >= 1
+
+
+def test_credential_anchored_hex_preserves_surrounding_context():
+    """The anchor context (key name, =, quotes) must survive verbatim."""
+    text = 'api_key = "abcdef0123456789abcdef0123456789abcdef01"'
+    out, _ = perseus.redact_text(text, {})
+    assert out.startswith('api_key = "[REDACTED:long_hex_secret]')
+    assert out.endswith('"')
+
+
+def test_bearer_header_prefix_still_preserved():
+    """Sanity: bearer_header prefix-preserve behavior must still work."""
+    text = "Authorization: Bearer abcdef0123456789abcdef0123456789"
+    out, _ = perseus.redact_text(text, {})
+    assert out.lower().startswith("authorization: bearer ")
+    assert "abcdef0123456789abcdef0123456789" not in out
+
+
+def test_at_query_git_log_output_survives_redaction():
+    """Integration regression: simulated @query 'git log' output preserved."""
+    git_log = "\n".join([
+        "86ca950 fix(resolve_read): add missing max_bytes assignment",
+        "ff5be4f fix(resolve_include): add missing max_bytes assignment",
+        "abcdef0123456789abcdef0123456789abcdef01 some commit",
+    ])
+    out, report = perseus.redact_text(git_log, {})
+    for hash_str in ("86ca950", "ff5be4f",
+                     "abcdef0123456789abcdef0123456789abcdef01"):
+        assert hash_str in out
+    assert report["counts"].get("long_hex_secret", 0) == 0