Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,47 @@ Each entry maps a release to the task IDs that shipped in it. The
single-file `perseus.py` runtime is the only required artifact; everything
else (installer, docs) is generated by `scripts/release.sh`.

## [1.0.6] — UNRELEASED

Critical security + correctness hotfix bundle from a staff-engineering code
review on 2026-06-03. Twelve fixes; no breaking config changes; auto-migration
where applicable. See GitHub milestone
[v1.0.6](https://github.com/tcconnally/perseus/milestone/1) for full tracking.

### 🔒 Security

- **#136** — `long_hex_secret` default redaction rule rewritten. The pre-1.0.6
rule (`\b[a-fA-F0-9]{40,}\b`) silently destroyed git commit hashes (40 hex),
SHA-256 checksums (64 hex), Docker digests, and Atlassian content hashes in
`@query`, `@waypoint`, and `@perseus` output. The rule now requires an
explicit credential anchor (`secret=`, `token:`, `api_key=`, `Authorization=`,
etc.) before redacting hex strings. Real secrets in credential context are
still caught; bare hashes survive verbatim. New regression tests in
`tests/test_redaction.py`.
- Internal: redaction rules now support an optional `_anchor_group` field that
identifies which capture group holds the secret payload (everything outside
the group is preserved). Used by `long_hex_secret`; available for future
context-aware rules.

### 🐛 Bug Fixes (pending — tracked in milestone)
- #128 — Mnēmē MD5→SHA-256 narrative migration
- #129 — `balanced` profile silently overrides `allow_query_shell`
- #130 — `--llm none` crashes
- #131 — `memory compact` hangs with slow LLM
- #135 — `@memory focus=` rigid heading match
- #137 — `@query` audit log leaks secrets
- #138 — `@query timeout=N` leaks into command
- #139 — MCP `_call_tool` subprocess leak
- #140 — `_save_narrative` lacks fsync
- #141 — User redaction patterns with capture groups
- #142 — Add Atlassian token to defaults

### 📦 Migration Notes
- No config breaking changes. If you had relied on `long_hex_secret` redacting
bare hex output (unlikely), add an explicit user rule under
`redaction.patterns` matching your shape.

---
## [1.0.6] — 2026-05-29

**Gauntlet hardening — 6 bugs fixed (code review + certification run):**
Expand Down
56 changes: 48 additions & 8 deletions perseus.py
Original file line number Diff line number Diff line change
Expand Up @@ -1384,8 +1384,19 @@ def _reset_plugin_cache() -> None:
{"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"},
# PEM private key block (covers RSA, EC, OPENSSH, generic)
{"name": "private_key_block", "pattern": r"-----BEGIN (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----"},
# Hex-encoded high-entropy strings of 40+ chars used as secrets/api hashes
{"name": "long_hex_secret", "pattern": r"\b[a-fA-F0-9]{40,}\b"},
# Hex-encoded high-entropy strings of 40+ chars in an obvious credential
# context (assigned to a `secret=`, `token=`, `key=`, `password=`,
# `api_key=` slot, or quoted after a colon in JSON/YAML).
#
# IMPORTANT: a bare `\b[a-fA-F0-9]{40,}\b` rule (pre-1.0.6 default) was a
# landmine — it matched git commit SHAs (40 hex chars), SHA-256 sums (64
# hex chars), Docker digests, and Atlassian content hashes, silently
# destroying forensically important data in `@query "git log"` output
# and similar. This rule now requires an explicit credential anchor.
# See: https://github.com/tcconnally/perseus/issues/136
{"name": "long_hex_secret",
"pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?",
"_anchor_group": 1},
# HuggingFace: hf_... (read/write tokens)
{"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"},
# Google Cloud API key: AIza...
Expand Down Expand Up @@ -1449,7 +1460,19 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]:
replacement = rule.get("replacement")
if not replacement:
replacement = f"[REDACTED:{name}]"
compiled.append({"name": name, "regex": regex, "replacement": str(replacement)})
# `_anchor_group` (rule-internal, default None): index of the capture
# group holding the SECRET payload (everything outside that group is
# context that must be preserved verbatim). Used by the credential-
# anchored `long_hex_secret` rule. When unset, fall back to legacy
# behavior: group(1) (if present) is treated as a leading prefix to
# preserve and the rest of the match is replaced.
anchor_group = rule.get("_anchor_group")
compiled.append({
"name": name,
"regex": regex,
"replacement": str(replacement),
"anchor_group": anchor_group,
})
return compiled


Expand Down Expand Up @@ -1483,12 +1506,29 @@ def redact_text(text: str, cfg: dict) -> tuple[str, dict]:
name = rule["name"]
regex = rule["regex"]
# subn returns (new, n); use a callable replacement so groupref-style
# rules (e.g. the bearer header rule that preserves the prefix via
# group 1) work consistently.
def _sub(match, _repl=rule["replacement"]):
# rules work consistently.
#
# Three modes:
# 1. `anchor_group=N`: the captured group at index N is the SECRET
# payload. Replace only that span; preserve everything else
# verbatim. Used by the credential-anchored `long_hex_secret` rule.
# 2. `match.lastindex` set (no anchor_group): legacy behavior — the
# first capture group is a prefix to preserve, everything after
# the prefix is replaced. Used by `bearer_header`.
# 3. No capture groups: replace the whole match.
def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")):
if _ag is not None:
try:
span_start, span_end = match.span(_ag)
except (IndexError, re.error):
return _repl
if span_start < 0:
return _repl
full = match.group(0)
rel_start = span_start - match.start()
rel_end = span_end - match.start()
return full[:rel_start] + _repl + full[rel_end:]
if match.lastindex:
# Preserve any leading captured group verbatim (e.g. the
# `Authorization: Bearer ` prefix); everything else is wiped.
return match.group(1) + _repl
return _repl
out, n = regex.subn(_sub, out)
Expand Down
56 changes: 48 additions & 8 deletions src/perseus/redaction.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,19 @@
{"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"},
# PEM private key block (covers RSA, EC, OPENSSH, generic)
{"name": "private_key_block", "pattern": r"-----BEGIN (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA |EC |OPENSSH |DSA |ENCRYPTED |PGP )?PRIVATE KEY-----"},
# Hex-encoded high-entropy strings of 40+ chars used as secrets/api hashes
{"name": "long_hex_secret", "pattern": r"\b[a-fA-F0-9]{40,}\b"},
# Hex-encoded high-entropy strings of 40+ chars in an obvious credential
# context (assigned to a `secret=`, `token=`, `key=`, `password=`,
# `api_key=` slot, or quoted after a colon in JSON/YAML).
#
# IMPORTANT: a bare `\b[a-fA-F0-9]{40,}\b` rule (pre-1.0.6 default) was a
# landmine — it matched git commit SHAs (40 hex chars), SHA-256 sums (64
# hex chars), Docker digests, and Atlassian content hashes, silently
# destroying forensically important data in `@query "git log"` output
# and similar. This rule now requires an explicit credential anchor.
# See: https://github.com/tcconnally/perseus/issues/136
{"name": "long_hex_secret",
"pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?",
"_anchor_group": 1},
# HuggingFace: hf_... (read/write tokens)
{"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"},
# Google Cloud API key: AIza...
Expand Down Expand Up @@ -109,7 +120,19 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]:
replacement = rule.get("replacement")
if not replacement:
replacement = f"[REDACTED:{name}]"
compiled.append({"name": name, "regex": regex, "replacement": str(replacement)})
# `_anchor_group` (rule-internal, default None): index of the capture
# group holding the SECRET payload (everything outside that group is
# context that must be preserved verbatim). Used by the credential-
# anchored `long_hex_secret` rule. When unset, fall back to legacy
# behavior: group(1) (if present) is treated as a leading prefix to
# preserve and the rest of the match is replaced.
anchor_group = rule.get("_anchor_group")
compiled.append({
"name": name,
"regex": regex,
"replacement": str(replacement),
"anchor_group": anchor_group,
})
return compiled


Expand Down Expand Up @@ -143,12 +166,29 @@ def redact_text(text: str, cfg: dict) -> tuple[str, dict]:
name = rule["name"]
regex = rule["regex"]
# subn returns (new, n); use a callable replacement so groupref-style
# rules (e.g. the bearer header rule that preserves the prefix via
# group 1) work consistently.
def _sub(match, _repl=rule["replacement"]):
# rules work consistently.
#
# Three modes:
# 1. `anchor_group=N`: the captured group at index N is the SECRET
# payload. Replace only that span; preserve everything else
# verbatim. Used by the credential-anchored `long_hex_secret` rule.
# 2. `match.lastindex` set (no anchor_group): legacy behavior — the
# first capture group is a prefix to preserve, everything after
# the prefix is replaced. Used by `bearer_header`.
# 3. No capture groups: replace the whole match.
def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")):
if _ag is not None:
try:
span_start, span_end = match.span(_ag)
except (IndexError, re.error):
return _repl
if span_start < 0:
return _repl
full = match.group(0)
rel_start = span_start - match.start()
rel_end = span_end - match.start()
return full[:rel_start] + _repl + full[rel_end:]
if match.lastindex:
# Preserve any leading captured group verbatim (e.g. the
# `Authorization: Bearer ` prefix); everything else is wiped.
return match.group(1) + _repl
return _repl
out, n = regex.subn(_sub, out)
Expand Down
67 changes: 67 additions & 0 deletions tests/test_redaction.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,3 +315,70 @@ def test_strict_profile_does_not_disable_redaction(monkeypatch, tmp_path):
}))
cfg = perseus.load_config()
assert cfg["redaction"]["enabled"] is True


# ── regression: #136 long_hex_secret must NOT eat git hashes ─────────────────


def test_bare_git_sha1_is_not_redacted_by_defaults():
"""Regression for #136 — bare 40-char git SHAs must survive default rules."""
git_log_line = "86ca950b3f1a2c4d5e6f7a8b9c0d1e2f3a4b5c6d fix(resolve_read)"
out, report = perseus.redact_text(git_log_line, {})
assert "86ca950b3f1a2c4d5e6f7a8b9c0d1e2f3a4b5c6d" in out
assert report["counts"].get("long_hex_secret", 0) == 0


def test_bare_sha256_checksum_is_not_redacted_by_defaults():
"""Regression for #136 — 64-char SHA-256 sums must survive."""
sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
text = f"checksum: {sha256} perseus.py"
out, report = perseus.redact_text(text, {})
assert sha256 in out
assert report["counts"].get("long_hex_secret", 0) == 0


def test_credential_anchored_hex_IS_redacted():
"""Regression for #136 — the new rule MUST still catch real secrets."""
cases = [
'api_key = "abcdef0123456789abcdef0123456789abcdef01"',
"secret=abcdef0123456789abcdef0123456789abcdef01",
'token: "abcdef0123456789abcdef0123456789abcdef01"',
"password = abcdef0123456789abcdef0123456789abcdef01",
"Authorization=abcdef0123456789abcdef0123456789abcdef01",
]
for text in cases:
out, report = perseus.redact_text(text, {})
assert "abcdef01" not in out, (
f"Hex secret in credential context not redacted: {text!r} → {out!r}"
)
assert report["counts"].get("long_hex_secret", 0) >= 1


def test_credential_anchored_hex_preserves_surrounding_context():
"""The anchor context (key name, =, quotes) must survive verbatim."""
text = 'api_key = "abcdef0123456789abcdef0123456789abcdef01"'
out, _ = perseus.redact_text(text, {})
assert out.startswith('api_key = "[REDACTED:long_hex_secret]')
assert out.endswith('"')


def test_bearer_header_prefix_still_preserved():
"""Sanity: bearer_header prefix-preserve behavior must still work."""
text = "Authorization: Bearer abcdef0123456789abcdef0123456789"
out, _ = perseus.redact_text(text, {})
assert out.lower().startswith("authorization: bearer ")
assert "abcdef0123456789abcdef0123456789" not in out


def test_at_query_git_log_output_survives_redaction():
"""Integration regression: simulated @query 'git log' output preserved."""
git_log = "\n".join([
"86ca950 fix(resolve_read): add missing max_bytes assignment",
"ff5be4f fix(resolve_include): add missing max_bytes assignment",
"abcdef0123456789abcdef0123456789abcdef01 some commit",
])
out, report = perseus.redact_text(git_log, {})
for hash_str in ("86ca950", "ff5be4f",
"abcdef0123456789abcdef0123456789abcdef01"):
assert hash_str in out
assert report["counts"].get("long_hex_secret", 0) == 0
Loading