diff --git a/CHANGELOG.md b/CHANGELOG.md index a46ef94..a87f2c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,47 @@ Each entry maps a release to the task IDs that shipped in it. The single-file `perseus.py` runtime is the only required artifact; everything else (installer, docs) is generated by `scripts/release.sh`. +## [1.0.6] — UNRELEASED + +Critical security + correctness hotfix bundle from a staff-engineering code +review on 2026-06-03. Twelve fixes; no breaking config changes; auto-migration +where applicable. See GitHub milestone +[v1.0.6](https://github.com/tcconnally/perseus/milestone/1) for full tracking. + +### 🔒 Security + +- **#136** — `long_hex_secret` default redaction rule rewritten. The pre-1.0.6 + rule (`\b[a-fA-F0-9]{40,}\b`) silently destroyed git commit hashes (40 hex), + SHA-256 checksums (64 hex), Docker digests, and Atlassian content hashes in + `@query`, `@waypoint`, and `@perseus` output. The rule now requires an + explicit credential anchor (`secret=`, `token:`, `api_key=`, `Authorization=`, + etc.) before redacting hex strings. Real secrets in credential context are + still caught; bare hashes survive verbatim. New regression tests in + `tests/test_redaction.py`. +- Internal: redaction rules now support an optional `_anchor_group` field that + identifies which capture group holds the secret payload (everything outside + the group is preserved). Used by `long_hex_secret`; available for future + context-aware rules. + +### 🐛 Bug Fixes (pending — tracked in milestone) +- #128 — Mnēmē MD5→SHA-256 narrative migration +- #129 — `balanced` profile silently overrides `allow_query_shell` +- #130 — `--llm none` crashes +- #131 — `memory compact` hangs with slow LLM +- #135 — `@memory focus=` rigid heading match +- #137 — `@query` audit log leaks secrets +- #138 — `@query timeout=N` leaks into command +- #139 — MCP `_call_tool` subprocess leak +- #140 — `_save_narrative` lacks fsync +- #141 — User redaction patterns with capture groups +- #142 — Add Atlassian token to defaults + +### 📦 Migration Notes +- No config breaking changes. If you had relied on `long_hex_secret` redacting + bare hex output (unlikely), add an explicit user rule under + `redaction.patterns` matching your shape. + +--- ## [1.0.6] — 2026-05-29 **Gauntlet hardening — 6 bugs fixed (code review + certification run):** diff --git a/perseus.py b/perseus.py index 00c6266..940e71b 100644 --- a/perseus.py +++ b/perseus.py @@ -1384,8 +1384,19 @@ def _reset_plugin_cache() -> None: {"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"}, # PEM private key block (covers RSA, EC, OPENSSH, generic) {"name": "private_key_block", "pattern": r"-----BEGIN (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----"}, - # Hex-encoded high-entropy strings of 40+ chars used as secrets/api hashes - {"name": "long_hex_secret", "pattern": r"\b[a-fA-F0-9]{40,}\b"}, + # Hex-encoded high-entropy strings of 40+ chars in an obvious credential + # context (assigned to a `secret=`, `token=`, `key=`, `password=`, + # `api_key=` slot, or quoted after a colon in JSON/YAML). + # + # IMPORTANT: a bare `\b[a-fA-F0-9]{40,}\b` rule (pre-1.0.6 default) was a + # landmine — it matched git commit SHAs (40 hex chars), SHA-256 sums (64 + # hex chars), Docker digests, and Atlassian content hashes, silently + # destroying forensically important data in `@query "git log"` output + # and similar. This rule now requires an explicit credential anchor. + # See: https://github.com/tcconnally/perseus/issues/136 + {"name": "long_hex_secret", + "pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?", + "_anchor_group": 1}, # HuggingFace: hf_... (read/write tokens) {"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"}, # Google Cloud API key: AIza... @@ -1449,7 +1460,19 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]: replacement = rule.get("replacement") if not replacement: replacement = f"[REDACTED:{name}]" - compiled.append({"name": name, "regex": regex, "replacement": str(replacement)}) + # `_anchor_group` (rule-internal, default None): index of the capture + # group holding the SECRET payload (everything outside that group is + # context that must be preserved verbatim). Used by the credential- + # anchored `long_hex_secret` rule. When unset, fall back to legacy + # behavior: group(1) (if present) is treated as a leading prefix to + # preserve and the rest of the match is replaced. + anchor_group = rule.get("_anchor_group") + compiled.append({ + "name": name, + "regex": regex, + "replacement": str(replacement), + "anchor_group": anchor_group, + }) return compiled @@ -1483,12 +1506,29 @@ def redact_text(text: str, cfg: dict) -> tuple[str, dict]: name = rule["name"] regex = rule["regex"] # subn returns (new, n); use a callable replacement so groupref-style - # rules (e.g. the bearer header rule that preserves the prefix via - # group 1) work consistently. - def _sub(match, _repl=rule["replacement"]): + # rules work consistently. + # + # Three modes: + # 1. `anchor_group=N`: the captured group at index N is the SECRET + # payload. Replace only that span; preserve everything else + # verbatim. Used by the credential-anchored `long_hex_secret` rule. + # 2. `match.lastindex` set (no anchor_group): legacy behavior — the + # first capture group is a prefix to preserve, everything after + # the prefix is replaced. Used by `bearer_header`. + # 3. No capture groups: replace the whole match. + def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")): + if _ag is not None: + try: + span_start, span_end = match.span(_ag) + except (IndexError, re.error): + return _repl + if span_start < 0: + return _repl + full = match.group(0) + rel_start = span_start - match.start() + rel_end = span_end - match.start() + return full[:rel_start] + _repl + full[rel_end:] if match.lastindex: - # Preserve any leading captured group verbatim (e.g. the - # `Authorization: Bearer ` prefix); everything else is wiped. return match.group(1) + _repl return _repl out, n = regex.subn(_sub, out) diff --git a/src/perseus/redaction.py b/src/perseus/redaction.py index fed0a1d..458ac2a 100644 --- a/src/perseus/redaction.py +++ b/src/perseus/redaction.py @@ -44,8 +44,19 @@ {"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"}, # PEM private key block (covers RSA, EC, OPENSSH, generic) {"name": "private_key_block", "pattern": r"-----BEGIN (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----"}, - # Hex-encoded high-entropy strings of 40+ chars used as secrets/api hashes - {"name": "long_hex_secret", "pattern": r"\b[a-fA-F0-9]{40,}\b"}, + # Hex-encoded high-entropy strings of 40+ chars in an obvious credential + # context (assigned to a `secret=`, `token=`, `key=`, `password=`, + # `api_key=` slot, or quoted after a colon in JSON/YAML). + # + # IMPORTANT: a bare `\b[a-fA-F0-9]{40,}\b` rule (pre-1.0.6 default) was a + # landmine — it matched git commit SHAs (40 hex chars), SHA-256 sums (64 + # hex chars), Docker digests, and Atlassian content hashes, silently + # destroying forensically important data in `@query "git log"` output + # and similar. This rule now requires an explicit credential anchor. + # See: https://github.com/tcconnally/perseus/issues/136 + {"name": "long_hex_secret", + "pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?", + "_anchor_group": 1}, # HuggingFace: hf_... (read/write tokens) {"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"}, # Google Cloud API key: AIza... @@ -109,7 +120,19 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]: replacement = rule.get("replacement") if not replacement: replacement = f"[REDACTED:{name}]" - compiled.append({"name": name, "regex": regex, "replacement": str(replacement)}) + # `_anchor_group` (rule-internal, default None): index of the capture + # group holding the SECRET payload (everything outside that group is + # context that must be preserved verbatim). Used by the credential- + # anchored `long_hex_secret` rule. When unset, fall back to legacy + # behavior: group(1) (if present) is treated as a leading prefix to + # preserve and the rest of the match is replaced. + anchor_group = rule.get("_anchor_group") + compiled.append({ + "name": name, + "regex": regex, + "replacement": str(replacement), + "anchor_group": anchor_group, + }) return compiled @@ -143,12 +166,29 @@ def redact_text(text: str, cfg: dict) -> tuple[str, dict]: name = rule["name"] regex = rule["regex"] # subn returns (new, n); use a callable replacement so groupref-style - # rules (e.g. the bearer header rule that preserves the prefix via - # group 1) work consistently. - def _sub(match, _repl=rule["replacement"]): + # rules work consistently. + # + # Three modes: + # 1. `anchor_group=N`: the captured group at index N is the SECRET + # payload. Replace only that span; preserve everything else + # verbatim. Used by the credential-anchored `long_hex_secret` rule. + # 2. `match.lastindex` set (no anchor_group): legacy behavior — the + # first capture group is a prefix to preserve, everything after + # the prefix is replaced. Used by `bearer_header`. + # 3. No capture groups: replace the whole match. + def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")): + if _ag is not None: + try: + span_start, span_end = match.span(_ag) + except (IndexError, re.error): + return _repl + if span_start < 0: + return _repl + full = match.group(0) + rel_start = span_start - match.start() + rel_end = span_end - match.start() + return full[:rel_start] + _repl + full[rel_end:] if match.lastindex: - # Preserve any leading captured group verbatim (e.g. the - # `Authorization: Bearer ` prefix); everything else is wiped. return match.group(1) + _repl return _repl out, n = regex.subn(_sub, out) diff --git a/tests/test_redaction.py b/tests/test_redaction.py index 80bfa51..ba305b5 100644 --- a/tests/test_redaction.py +++ b/tests/test_redaction.py @@ -315,3 +315,70 @@ def test_strict_profile_does_not_disable_redaction(monkeypatch, tmp_path): })) cfg = perseus.load_config() assert cfg["redaction"]["enabled"] is True + + +# ── regression: #136 long_hex_secret must NOT eat git hashes ───────────────── + + +def test_bare_git_sha1_is_not_redacted_by_defaults(): + """Regression for #136 — bare 40-char git SHAs must survive default rules.""" + git_log_line = "86ca950b3f1a2c4d5e6f7a8b9c0d1e2f3a4b5c6d fix(resolve_read)" + out, report = perseus.redact_text(git_log_line, {}) + assert "86ca950b3f1a2c4d5e6f7a8b9c0d1e2f3a4b5c6d" in out + assert report["counts"].get("long_hex_secret", 0) == 0 + + +def test_bare_sha256_checksum_is_not_redacted_by_defaults(): + """Regression for #136 — 64-char SHA-256 sums must survive.""" + sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + text = f"checksum: {sha256} perseus.py" + out, report = perseus.redact_text(text, {}) + assert sha256 in out + assert report["counts"].get("long_hex_secret", 0) == 0 + + +def test_credential_anchored_hex_IS_redacted(): + """Regression for #136 — the new rule MUST still catch real secrets.""" + cases = [ + 'api_key = "abcdef0123456789abcdef0123456789abcdef01"', + "secret=abcdef0123456789abcdef0123456789abcdef01", + 'token: "abcdef0123456789abcdef0123456789abcdef01"', + "password = abcdef0123456789abcdef0123456789abcdef01", + "Authorization=abcdef0123456789abcdef0123456789abcdef01", + ] + for text in cases: + out, report = perseus.redact_text(text, {}) + assert "abcdef01" not in out, ( + f"Hex secret in credential context not redacted: {text!r} → {out!r}" + ) + assert report["counts"].get("long_hex_secret", 0) >= 1 + + +def test_credential_anchored_hex_preserves_surrounding_context(): + """The anchor context (key name, =, quotes) must survive verbatim.""" + text = 'api_key = "abcdef0123456789abcdef0123456789abcdef01"' + out, _ = perseus.redact_text(text, {}) + assert out.startswith('api_key = "[REDACTED:long_hex_secret]') + assert out.endswith('"') + + +def test_bearer_header_prefix_still_preserved(): + """Sanity: bearer_header prefix-preserve behavior must still work.""" + text = "Authorization: Bearer abcdef0123456789abcdef0123456789" + out, _ = perseus.redact_text(text, {}) + assert out.lower().startswith("authorization: bearer ") + assert "abcdef0123456789abcdef0123456789" not in out + + +def test_at_query_git_log_output_survives_redaction(): + """Integration regression: simulated @query 'git log' output preserved.""" + git_log = "\n".join([ + "86ca950 fix(resolve_read): add missing max_bytes assignment", + "ff5be4f fix(resolve_include): add missing max_bytes assignment", + "abcdef0123456789abcdef0123456789abcdef01 some commit", + ]) + out, report = perseus.redact_text(git_log, {}) + for hash_str in ("86ca950", "ff5be4f", + "abcdef0123456789abcdef0123456789abcdef01"): + assert hash_str in out + assert report["counts"].get("long_hex_secret", 0) == 0