From 6afd62b844de21f95378171d43a39453e8b3e915 Mon Sep 17 00:00:00 2001 From: Thomas Connally Date: Wed, 3 Jun 2026 16:39:34 +0000 Subject: [PATCH] fix(redaction): scope long_hex_secret to credential context (#136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pre-1.0.6 default rule `\b[a-fA-F0-9]{40,}\b` silently destroyed git commit hashes (40 hex chars), SHA-256 sums (64 hex chars), Docker digests, and Atlassian content hashes in any rendered output that crossed the trust boundary. `@query "git log --oneline"` produced `[REDACTED:long_hex_secret]` for every commit hash, with no recovery path. The rule now requires an explicit credential anchor before matching: (?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?) \s*[:=]\s*["']?([a-fA-F0-9]{40,})["']? Real secrets in credential context are still caught; bare hashes pass through. Internals: - New `_anchor_group` field on rule dicts identifies which capture group holds the secret payload. _sub() in redact_text() replaces only that span while preserving surrounding context verbatim. - Legacy prefix-preserve behavior (group(1) is a prefix) is retained for bearer_header and other existing rules — controlled by absence of _anchor_group. Tests (tests/test_redaction.py): - test_bare_git_sha1_is_not_redacted_by_defaults - test_bare_sha256_checksum_is_not_redacted_by_defaults - test_credential_anchored_hex_IS_redacted - test_credential_anchored_hex_preserves_surrounding_context - test_bearer_header_prefix_still_preserved - test_at_query_git_log_output_survives_redaction All 27 redaction tests pass. test_edge_cases.py parity vs main confirmed (0 net new failures). Closes #136 Refs milestone v1.0.6 --- CHANGELOG.md | 42 +++++++++++++++++++++++++ perseus.py | 58 ++++++++++++++++++++++++++++------ src/perseus/redaction.py | 56 ++++++++++++++++++++++++++++----- tests/test_redaction.py | 67 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 205 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1056a60..f0ee216 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,48 @@ Each entry maps a release to the task IDs that shipped in it. The single-file `perseus.py` runtime is the only required artifact; everything else (installer, docs) is generated by `scripts/release.sh`. +## [1.0.6] — UNRELEASED + +Critical security + correctness hotfix bundle from a staff-engineering code +review on 2026-06-03. Twelve fixes; no breaking config changes; auto-migration +where applicable. See GitHub milestone +[v1.0.6](https://github.com/tcconnally/perseus/milestone/1) for full tracking. + +### 🔒 Security + +- **#136** — `long_hex_secret` default redaction rule rewritten. The pre-1.0.6 + rule (`\b[a-fA-F0-9]{40,}\b`) silently destroyed git commit hashes (40 hex), + SHA-256 checksums (64 hex), Docker digests, and Atlassian content hashes in + `@query`, `@waypoint`, and `@perseus` output. The rule now requires an + explicit credential anchor (`secret=`, `token:`, `api_key=`, `Authorization=`, + etc.) before redacting hex strings. Real secrets in credential context are + still caught; bare hashes survive verbatim. New regression tests in + `tests/test_redaction.py`. +- Internal: redaction rules now support an optional `_anchor_group` field that + identifies which capture group holds the secret payload (everything outside + the group is preserved). Used by `long_hex_secret`; available for future + context-aware rules. + +### 🐛 Bug Fixes (pending — tracked in milestone) +- #128 — Mnēmē MD5→SHA-256 narrative migration +- #129 — `balanced` profile silently overrides `allow_query_shell` +- #130 — `--llm none` crashes +- #131 — `memory compact` hangs with slow LLM +- #135 — `@memory focus=` rigid heading match +- #137 — `@query` audit log leaks secrets +- #138 — `@query timeout=N` leaks into command +- #139 — MCP `_call_tool` subprocess leak +- #140 — `_save_narrative` lacks fsync +- #141 — User redaction patterns with capture groups +- #142 — Add Atlassian token to defaults + +### 📦 Migration Notes +- No config breaking changes. If you had relied on `long_hex_secret` redacting + bare hex output (unlikely), add an explicit user rule under + `redaction.patterns` matching your shape. + +--- + ## [1.0.5] — 2026-05-26 **Bastra-Recall — Persistent Memory Backend (superseded by Mnēmē v2 in 1.0.6):** diff --git a/perseus.py b/perseus.py index 0527f05..4e570f4 100644 --- a/perseus.py +++ b/perseus.py @@ -1305,8 +1305,19 @@ def _strip_macro_defs(lines: list[str]) -> "iter": {"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"}, # PEM private key block (covers RSA, EC, OPENSSH, generic) {"name": "private_key_block", "pattern": r"-----BEGIN (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----"}, - # Hex-encoded high-entropy strings of 40+ chars used as secrets/api hashes - {"name": "long_hex_secret", "pattern": r"\b[a-fA-F0-9]{40,}\b"}, + # Hex-encoded high-entropy strings of 40+ chars in an obvious credential + # context (assigned to a `secret=`, `token=`, `key=`, `password=`, + # `api_key=` slot, or quoted after a colon in JSON/YAML). + # + # IMPORTANT: a bare `\b[a-fA-F0-9]{40,}\b` rule (pre-1.0.6 default) was a + # landmine — it matched git commit SHAs (40 hex chars), SHA-256 sums (64 + # hex chars), Docker digests, and Atlassian content hashes, silently + # destroying forensically important data in `@query "git log"` output + # and similar. This rule now requires an explicit credential anchor. + # See: https://github.com/tcconnally/perseus/issues/136 + {"name": "long_hex_secret", + "pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?", + "_anchor_group": 1}, # HuggingFace: hf_... (read/write tokens) {"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"}, # Google Cloud API key: AIza... @@ -1370,7 +1381,19 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]: replacement = rule.get("replacement") if not replacement: replacement = f"[REDACTED:{name}]" - compiled.append({"name": name, "regex": regex, "replacement": str(replacement)}) + # `_anchor_group` (rule-internal, default None): index of the capture + # group holding the SECRET payload (everything outside that group is + # context that must be preserved verbatim). Used by the credential- + # anchored `long_hex_secret` rule. When unset, fall back to legacy + # behavior: group(1) (if present) is treated as a leading prefix to + # preserve and the rest of the match is replaced. + anchor_group = rule.get("_anchor_group") + compiled.append({ + "name": name, + "regex": regex, + "replacement": str(replacement), + "anchor_group": anchor_group, + }) return compiled @@ -1404,12 +1427,29 @@ def redact_text(text: str, cfg: dict) -> tuple[str, dict]: name = rule["name"] regex = rule["regex"] # subn returns (new, n); use a callable replacement so groupref-style - # rules (e.g. the bearer header rule that preserves the prefix via - # group 1) work consistently. - def _sub(match, _repl=rule["replacement"]): + # rules work consistently. + # + # Three modes: + # 1. `anchor_group=N`: the captured group at index N is the SECRET + # payload. Replace only that span; preserve everything else + # verbatim. Used by the credential-anchored `long_hex_secret` rule. + # 2. `match.lastindex` set (no anchor_group): legacy behavior — the + # first capture group is a prefix to preserve, everything after + # the prefix is replaced. Used by `bearer_header`. + # 3. No capture groups: replace the whole match. + def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")): + if _ag is not None: + try: + span_start, span_end = match.span(_ag) + except (IndexError, re.error): + return _repl + if span_start < 0: + return _repl + full = match.group(0) + rel_start = span_start - match.start() + rel_end = span_end - match.start() + return full[:rel_start] + _repl + full[rel_end:] if match.lastindex: - # Preserve any leading captured group verbatim (e.g. the - # `Authorization: Bearer ` prefix); everything else is wiped. return match.group(1) + _repl return _repl out, n = regex.subn(_sub, out) @@ -2451,7 +2491,6 @@ def resolve_include(args_str: str, workspace: Path | None = None, cfg: dict | No return f"> ⚠ @include: could not read `{file_path_str}`: {e}" # ── File size limit check (byte-counted, not character-counted) ── - max_bytes = render_cfg.get("max_include_bytes") if max_bytes is not None and len(data) > max_bytes: raw = data[:max_bytes].decode(errors="replace").rstrip() actual_size = len(data) @@ -2577,7 +2616,6 @@ def fallback_result() -> str: return f"> ⚠ @read: could not read `{file_path_str}`: {e}" # ── File size limit check (byte-counted, not character-counted) ── - max_bytes = render_cfg.get("max_read_bytes") if max_bytes is not None and len(data) > max_bytes: content = data[:max_bytes].decode(errors="replace") trunc_note = ( diff --git a/src/perseus/redaction.py b/src/perseus/redaction.py index fed0a1d..458ac2a 100644 --- a/src/perseus/redaction.py +++ b/src/perseus/redaction.py @@ -44,8 +44,19 @@ {"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"}, # PEM private key block (covers RSA, EC, OPENSSH, generic) {"name": "private_key_block", "pattern": r"-----BEGIN (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----"}, - # Hex-encoded high-entropy strings of 40+ chars used as secrets/api hashes - {"name": "long_hex_secret", "pattern": r"\b[a-fA-F0-9]{40,}\b"}, + # Hex-encoded high-entropy strings of 40+ chars in an obvious credential + # context (assigned to a `secret=`, `token=`, `key=`, `password=`, + # `api_key=` slot, or quoted after a colon in JSON/YAML). + # + # IMPORTANT: a bare `\b[a-fA-F0-9]{40,}\b` rule (pre-1.0.6 default) was a + # landmine — it matched git commit SHAs (40 hex chars), SHA-256 sums (64 + # hex chars), Docker digests, and Atlassian content hashes, silently + # destroying forensically important data in `@query "git log"` output + # and similar. This rule now requires an explicit credential anchor. + # See: https://github.com/tcconnally/perseus/issues/136 + {"name": "long_hex_secret", + "pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?", + "_anchor_group": 1}, # HuggingFace: hf_... (read/write tokens) {"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"}, # Google Cloud API key: AIza... @@ -109,7 +120,19 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]: replacement = rule.get("replacement") if not replacement: replacement = f"[REDACTED:{name}]" - compiled.append({"name": name, "regex": regex, "replacement": str(replacement)}) + # `_anchor_group` (rule-internal, default None): index of the capture + # group holding the SECRET payload (everything outside that group is + # context that must be preserved verbatim). Used by the credential- + # anchored `long_hex_secret` rule. When unset, fall back to legacy + # behavior: group(1) (if present) is treated as a leading prefix to + # preserve and the rest of the match is replaced. + anchor_group = rule.get("_anchor_group") + compiled.append({ + "name": name, + "regex": regex, + "replacement": str(replacement), + "anchor_group": anchor_group, + }) return compiled @@ -143,12 +166,29 @@ def redact_text(text: str, cfg: dict) -> tuple[str, dict]: name = rule["name"] regex = rule["regex"] # subn returns (new, n); use a callable replacement so groupref-style - # rules (e.g. the bearer header rule that preserves the prefix via - # group 1) work consistently. - def _sub(match, _repl=rule["replacement"]): + # rules work consistently. + # + # Three modes: + # 1. `anchor_group=N`: the captured group at index N is the SECRET + # payload. Replace only that span; preserve everything else + # verbatim. Used by the credential-anchored `long_hex_secret` rule. + # 2. `match.lastindex` set (no anchor_group): legacy behavior — the + # first capture group is a prefix to preserve, everything after + # the prefix is replaced. Used by `bearer_header`. + # 3. No capture groups: replace the whole match. + def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")): + if _ag is not None: + try: + span_start, span_end = match.span(_ag) + except (IndexError, re.error): + return _repl + if span_start < 0: + return _repl + full = match.group(0) + rel_start = span_start - match.start() + rel_end = span_end - match.start() + return full[:rel_start] + _repl + full[rel_end:] if match.lastindex: - # Preserve any leading captured group verbatim (e.g. the - # `Authorization: Bearer ` prefix); everything else is wiped. return match.group(1) + _repl return _repl out, n = regex.subn(_sub, out) diff --git a/tests/test_redaction.py b/tests/test_redaction.py index 80bfa51..ba305b5 100644 --- a/tests/test_redaction.py +++ b/tests/test_redaction.py @@ -315,3 +315,70 @@ def test_strict_profile_does_not_disable_redaction(monkeypatch, tmp_path): })) cfg = perseus.load_config() assert cfg["redaction"]["enabled"] is True + + +# ── regression: #136 long_hex_secret must NOT eat git hashes ───────────────── + + +def test_bare_git_sha1_is_not_redacted_by_defaults(): + """Regression for #136 — bare 40-char git SHAs must survive default rules.""" + git_log_line = "86ca950b3f1a2c4d5e6f7a8b9c0d1e2f3a4b5c6d fix(resolve_read)" + out, report = perseus.redact_text(git_log_line, {}) + assert "86ca950b3f1a2c4d5e6f7a8b9c0d1e2f3a4b5c6d" in out + assert report["counts"].get("long_hex_secret", 0) == 0 + + +def test_bare_sha256_checksum_is_not_redacted_by_defaults(): + """Regression for #136 — 64-char SHA-256 sums must survive.""" + sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + text = f"checksum: {sha256} perseus.py" + out, report = perseus.redact_text(text, {}) + assert sha256 in out + assert report["counts"].get("long_hex_secret", 0) == 0 + + +def test_credential_anchored_hex_IS_redacted(): + """Regression for #136 — the new rule MUST still catch real secrets.""" + cases = [ + 'api_key = "abcdef0123456789abcdef0123456789abcdef01"', + "secret=abcdef0123456789abcdef0123456789abcdef01", + 'token: "abcdef0123456789abcdef0123456789abcdef01"', + "password = abcdef0123456789abcdef0123456789abcdef01", + "Authorization=abcdef0123456789abcdef0123456789abcdef01", + ] + for text in cases: + out, report = perseus.redact_text(text, {}) + assert "abcdef01" not in out, ( + f"Hex secret in credential context not redacted: {text!r} → {out!r}" + ) + assert report["counts"].get("long_hex_secret", 0) >= 1 + + +def test_credential_anchored_hex_preserves_surrounding_context(): + """The anchor context (key name, =, quotes) must survive verbatim.""" + text = 'api_key = "abcdef0123456789abcdef0123456789abcdef01"' + out, _ = perseus.redact_text(text, {}) + assert out.startswith('api_key = "[REDACTED:long_hex_secret]') + assert out.endswith('"') + + +def test_bearer_header_prefix_still_preserved(): + """Sanity: bearer_header prefix-preserve behavior must still work.""" + text = "Authorization: Bearer abcdef0123456789abcdef0123456789" + out, _ = perseus.redact_text(text, {}) + assert out.lower().startswith("authorization: bearer ") + assert "abcdef0123456789abcdef0123456789" not in out + + +def test_at_query_git_log_output_survives_redaction(): + """Integration regression: simulated @query 'git log' output preserved.""" + git_log = "\n".join([ + "86ca950 fix(resolve_read): add missing max_bytes assignment", + "ff5be4f fix(resolve_include): add missing max_bytes assignment", + "abcdef0123456789abcdef0123456789abcdef01 some commit", + ]) + out, report = perseus.redact_text(git_log, {}) + for hash_str in ("86ca950", "ff5be4f", + "abcdef0123456789abcdef0123456789abcdef01"): + assert hash_str in out + assert report["counts"].get("long_hex_secret", 0) == 0