From 8f364ba175937fd0cdb36d0a4c2d593cff33272b Mon Sep 17 00:00:00 2001 From: Hermes Agent Date: Thu, 4 Jun 2026 22:19:25 -0500 Subject: [PATCH] fix: P1-high issues #138, #140, #141, #142, #167 - #138: Move timeout=N extraction before command parsing in resolve_query so the modifier doesn't leak into the executed shell command - #140: Add _safe_fsync(file) + parent dir fsync before os.replace in _save_narrative to prevent narrative loss on crash - #141: Replace match.lastindex heuristic with explicit _prefix_group field on bearer_header rule. User-supplied redaction patterns with capture groups no longer silently truncate data - #142: Add Atlassian API token (ATATT3...) to DEFAULT_REDACTION_RULES - #167: Run redact_text on webhook payload body_dict before external delivery to prevent secret exfiltration via webhooks --- perseus.py | 66 ++++++++++++++++++++++++++------- src/perseus/directives/query.py | 23 +++++++----- src/perseus/mneme_narrative.py | 18 +++++++++ src/perseus/redaction.py | 21 ++++++++--- src/perseus/webhooks.py | 9 +++++ 5 files changed, 107 insertions(+), 30 deletions(-) diff --git a/perseus.py b/perseus.py index 5fb826d..615da03 100644 --- a/perseus.py +++ b/perseus.py @@ -804,6 +804,14 @@ def _webhook_worker(url, ep, wh_cfg, q): "version": version, "data": payload } + # #167: redact secrets from webhook payload before external delivery. + # Pre-1.0.6, directive args and output snippets in payload["data"] + # were sent verbatim to webhook endpoints, leaking secrets. + try: + redacted_data, _ = redact_text(payload, cfg) + body_dict["data"] = redacted_data + except Exception: + pass # redaction failure must not block webhook delivery body_json = json.dumps(body_dict) body_data = body_json.encode("utf-8") @@ -1533,8 +1541,8 @@ def _reset_plugin_cache() -> None: {"name": "aws_access_key_id", "pattern": r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b"}, # Slack bot/user/app/refresh tokens {"name": "slack_token", "pattern": r"\bxox[abprso]-[A-Za-z0-9-]{10,}\b"}, - # Generic bearer header value (Authorization: Bearer XXXX) - {"name": "bearer_header", "pattern": r"(?i)(authorization:\s*bearer\s+)[A-Za-z0-9._\-+/=]{16,}"}, + # Generic bearer header value (Authorization: Bearer *** + {"name": "bearer_header", "pattern": r"(?i)(authorization:\s*bearer\s+)[A-Za-z0-9._\-+/=]{16,}", "_prefix_group": 1}, # JWT (three base64url segments). Conservative: require non-trivial first segment. {"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"}, # PEM private key block (covers RSA, EC, OPENSSH, generic) @@ -1552,6 +1560,9 @@ def _reset_plugin_cache() -> None: {"name": "long_hex_secret", "pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?", "_anchor_group": 1}, + # Atlassian API token: ATATT3... (Confluence/Jira personal access tokens) + # See: https://github.com/tcconnally/perseus/issues/142 + {"name": "atlassian_api_token", "pattern": r"\bATATT3[A-Za-z0-9+/=_-]{40,}\b"}, # HuggingFace: hf_... (read/write tokens) {"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"}, # Google Cloud API key: AIza... @@ -1622,11 +1633,13 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]: # behavior: group(1) (if present) is treated as a leading prefix to # preserve and the rest of the match is replaced. anchor_group = rule.get("_anchor_group") + prefix_group = rule.get("_prefix_group") compiled.append({ "name": name, "regex": regex, "replacement": str(replacement), "anchor_group": anchor_group, + "prefix_group": prefix_group, }) return compiled @@ -1683,8 +1696,13 @@ def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")): rel_start = span_start - match.start() rel_end = span_end - match.start() return full[:rel_start] + _repl + full[rel_end:] - if match.lastindex: - return match.group(1) + _repl + # #141: prefix-preservation only for rules that explicitly + # declare _prefix_group (e.g. bearer_header). User-supplied + # patterns with accidental capture groups would silently + # truncate data under the old `match.lastindex` heuristic. + _pg = rule.get("prefix_group") + if _pg is not None and match.lastindex and match.lastindex >= _pg: + return match.group(_pg) + _repl return _repl out, n = regex.subn(_sub, out) if n: @@ -1734,7 +1752,6 @@ def redact_value(value, cfg: dict) -> tuple[object, dict]: counts[name] = counts.get(name, 0) + count return out, {"enabled": enabled, "total": total, "counts": counts, "rules_active": rules_active} return value, {"enabled": True, "total": 0, "counts": {}, "rules_active": 0} - # Callers can disable via `audit.enabled = false`. _VALIDATOR_CACHE: dict[str, Callable] = {} @@ -3605,6 +3622,17 @@ def resolve_query(args_str: str, cfg: dict, workspace: "Path | None" = None) -> fallback = _unescape_fallback(fallback) raw = (raw[:fb_match.start()] + raw[fb_match.end():]).rstrip() + # #138: strip timeout=N modifier BEFORE command extraction to prevent + # it from leaking into the executed shell command. + + # Extract timeout=N modifier (per-directive override, default 30s) + timeout = int(cfg["render"].get("query_timeout_s", 30)) + tm_match = re.search(r'\s+timeout=(\d+)(?:\s|$)', raw) + if tm_match: + timeout = int(tm_match.group(1)) + raw = (raw[:tm_match.start()] + raw[tm_match.end():]).rstrip() + + cmd_match = re.match(r'^"((?:[^"\\]|\\.)*)"', raw) # double-quoted if not cmd_match: cmd_match = re.match(r"^'((?:[^'\\]|\\.)*)'", raw) # single-quoted @@ -3626,13 +3654,6 @@ def resolve_query(args_str: str, cfg: dict, workspace: "Path | None" = None) -> command=cmd[:500], shell=shell) - # Extract timeout=N modifier (per-directive override, default 30s) - timeout = int(cfg["render"].get("query_timeout_s", 30)) - tm_match = re.search(r'\s+timeout=(\d+)(?:\s|$)', raw) - if tm_match: - timeout = int(tm_match.group(1)) - raw = (raw[:tm_match.start()] + raw[tm_match.end():]).rstrip() - try: # #139: when invoked under MCP's _call_tool timeout wrapper, the # wrapper needs to kill this subprocess (and any descendants) if @@ -3690,7 +3711,7 @@ class _Result: return fallback # #137: redact secrets out of `cmd` and `stderr` before interpolating # them into render output. Without this, a command like - # `@query "curl -H 'Authorization: Bearer ghp_…'"` leaks the bearer + # `@query "curl -H 'Authorization: Bearer *** leaks the bearer # token in the exit-nonzero header. Render-time redaction only runs # later in the pipeline and only on the final assembled output, but # by then this string has been logged elsewhere. @@ -4546,7 +4567,6 @@ def format_prefetch_human(result: dict) -> str: trigger = entry.get("trigger") or "no-trigger" lines.append(f"- {entry['status']}: {entry['rule']} {trigger} -> {target}{reason}") return "\n".join(lines) - # ──────────────────────────────── @agent ────────────────────────────────────── def resolve_agent(args_str: str, cfg: dict, workspace: Path | None = None) -> str: @@ -9758,6 +9778,21 @@ def _load_narrative(path: Path) -> tuple[dict, str]: return fm, body + +def _safe_fsync(path): + """Fsync file + parent directory for durability (#140).""" + try: + with open(path, "rb") as f: + os.fsync(f.fileno()) + except OSError: + pass + try: + fd = os.open(str(path.parent), os.O_RDONLY) + os.fsync(fd) + os.close(fd) + except OSError: + pass + def _save_narrative(path: Path, frontmatter: dict, body: str) -> None: """Atomically write the narrative file (temp + rename).""" path.parent.mkdir(parents=True, exist_ok=True) @@ -9765,6 +9800,9 @@ def _save_narrative(path: Path, frontmatter: dict, body: str) -> None: payload = f"---\n{fm_yaml}\n---\n\n{body.rstrip()}\n" tmp = path.with_suffix(path.suffix + ".tmp") tmp.write_text(payload, encoding="utf-8") + # #140: fsync temp file + parent directory before atomic rename to + # prevent narrative loss on system crash / power loss. + _safe_fsync(tmp) os.replace(tmp, path) diff --git a/src/perseus/directives/query.py b/src/perseus/directives/query.py index 5ea2157..1169bd7 100644 --- a/src/perseus/directives/query.py +++ b/src/perseus/directives/query.py @@ -172,6 +172,17 @@ def resolve_query(args_str: str, cfg: dict, workspace: "Path | None" = None) -> fallback = _unescape_fallback(fallback) raw = (raw[:fb_match.start()] + raw[fb_match.end():]).rstrip() + # #138: strip timeout=N modifier BEFORE command extraction to prevent + # it from leaking into the executed shell command. + + # Extract timeout=N modifier (per-directive override, default 30s) + timeout = int(cfg["render"].get("query_timeout_s", 30)) + tm_match = re.search(r'\s+timeout=(\d+)(?:\s|$)', raw) + if tm_match: + timeout = int(tm_match.group(1)) + raw = (raw[:tm_match.start()] + raw[tm_match.end():]).rstrip() + + cmd_match = re.match(r'^"((?:[^"\\]|\\.)*)"', raw) # double-quoted if not cmd_match: cmd_match = re.match(r"^'((?:[^'\\]|\\.)*)'", raw) # single-quoted @@ -193,13 +204,6 @@ def resolve_query(args_str: str, cfg: dict, workspace: "Path | None" = None) -> command=cmd[:500], shell=shell) - # Extract timeout=N modifier (per-directive override, default 30s) - timeout = int(cfg["render"].get("query_timeout_s", 30)) - tm_match = re.search(r'\s+timeout=(\d+)(?:\s|$)', raw) - if tm_match: - timeout = int(tm_match.group(1)) - raw = (raw[:tm_match.start()] + raw[tm_match.end():]).rstrip() - try: # #139: when invoked under MCP's _call_tool timeout wrapper, the # wrapper needs to kill this subprocess (and any descendants) if @@ -257,7 +261,7 @@ class _Result: return fallback # #137: redact secrets out of `cmd` and `stderr` before interpolating # them into render output. Without this, a command like - # `@query "curl -H 'Authorization: Bearer ghp_…'"` leaks the bearer + # `@query "curl -H 'Authorization: Bearer *** leaks the bearer # token in the exit-nonzero header. Render-time redaction only runs # later in the pipeline and only on the final assembled output, but # by then this string has been logged elsewhere. @@ -1112,5 +1116,4 @@ def format_prefetch_human(result: dict) -> str: reason = f" ({entry['reason']})" if entry.get("reason") else "" trigger = entry.get("trigger") or "no-trigger" lines.append(f"- {entry['status']}: {entry['rule']} {trigger} -> {target}{reason}") - return "\n".join(lines) - + return "\n".join(lines) \ No newline at end of file diff --git a/src/perseus/mneme_narrative.py b/src/perseus/mneme_narrative.py index 384ed9a..cb4a480 100644 --- a/src/perseus/mneme_narrative.py +++ b/src/perseus/mneme_narrative.py @@ -202,6 +202,21 @@ def _load_narrative(path: Path) -> tuple[dict, str]: return fm, body + +def _safe_fsync(path): + """Fsync file + parent directory for durability (#140).""" + try: + with open(path, "rb") as f: + os.fsync(f.fileno()) + except OSError: + pass + try: + fd = os.open(str(path.parent), os.O_RDONLY) + os.fsync(fd) + os.close(fd) + except OSError: + pass + def _save_narrative(path: Path, frontmatter: dict, body: str) -> None: """Atomically write the narrative file (temp + rename).""" path.parent.mkdir(parents=True, exist_ok=True) @@ -209,6 +224,9 @@ def _save_narrative(path: Path, frontmatter: dict, body: str) -> None: payload = f"---\n{fm_yaml}\n---\n\n{body.rstrip()}\n" tmp = path.with_suffix(path.suffix + ".tmp") tmp.write_text(payload, encoding="utf-8") + # #140: fsync temp file + parent directory before atomic rename to + # prevent narrative loss on system crash / power loss. + _safe_fsync(tmp) os.replace(tmp, path) diff --git a/src/perseus/redaction.py b/src/perseus/redaction.py index 458ac2a..679e38a 100644 --- a/src/perseus/redaction.py +++ b/src/perseus/redaction.py @@ -38,8 +38,8 @@ {"name": "aws_access_key_id", "pattern": r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b"}, # Slack bot/user/app/refresh tokens {"name": "slack_token", "pattern": r"\bxox[abprso]-[A-Za-z0-9-]{10,}\b"}, - # Generic bearer header value (Authorization: Bearer XXXX) - {"name": "bearer_header", "pattern": r"(?i)(authorization:\s*bearer\s+)[A-Za-z0-9._\-+/=]{16,}"}, + # Generic bearer header value (Authorization: Bearer *** + {"name": "bearer_header", "pattern": r"(?i)(authorization:\s*bearer\s+)[A-Za-z0-9._\-+/=]{16,}", "_prefix_group": 1}, # JWT (three base64url segments). Conservative: require non-trivial first segment. {"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"}, # PEM private key block (covers RSA, EC, OPENSSH, generic) @@ -57,6 +57,9 @@ {"name": "long_hex_secret", "pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?", "_anchor_group": 1}, + # Atlassian API token: ATATT3... (Confluence/Jira personal access tokens) + # See: https://github.com/tcconnally/perseus/issues/142 + {"name": "atlassian_api_token", "pattern": r"\bATATT3[A-Za-z0-9+/=_-]{40,}\b"}, # HuggingFace: hf_... (read/write tokens) {"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"}, # Google Cloud API key: AIza... @@ -127,11 +130,13 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]: # behavior: group(1) (if present) is treated as a leading prefix to # preserve and the rest of the match is replaced. anchor_group = rule.get("_anchor_group") + prefix_group = rule.get("_prefix_group") compiled.append({ "name": name, "regex": regex, "replacement": str(replacement), "anchor_group": anchor_group, + "prefix_group": prefix_group, }) return compiled @@ -188,8 +193,13 @@ def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")): rel_start = span_start - match.start() rel_end = span_end - match.start() return full[:rel_start] + _repl + full[rel_end:] - if match.lastindex: - return match.group(1) + _repl + # #141: prefix-preservation only for rules that explicitly + # declare _prefix_group (e.g. bearer_header). User-supplied + # patterns with accidental capture groups would silently + # truncate data under the old `match.lastindex` heuristic. + _pg = rule.get("prefix_group") + if _pg is not None and match.lastindex and match.lastindex >= _pg: + return match.group(_pg) + _repl return _repl out, n = regex.subn(_sub, out) if n: @@ -238,5 +248,4 @@ def redact_value(value, cfg: dict) -> tuple[object, dict]: for name, count in rep.get("counts", {}).items(): counts[name] = counts.get(name, 0) + count return out, {"enabled": enabled, "total": total, "counts": counts, "rules_active": rules_active} - return value, {"enabled": True, "total": 0, "counts": {}, "rules_active": 0} - + return value, {"enabled": True, "total": 0, "counts": {}, "rules_active": 0} \ No newline at end of file diff --git a/src/perseus/webhooks.py b/src/perseus/webhooks.py index a6cce83..ef13ae3 100644 --- a/src/perseus/webhooks.py +++ b/src/perseus/webhooks.py @@ -139,6 +139,15 @@ def _webhook_worker(url, ep, wh_cfg, q): "version": version, "data": payload } + # #167: redact secrets from webhook payload before external delivery. + # Pre-1.0.6, directive args and output snippets in payload["data"] + # were sent verbatim to webhook endpoints, leaking secrets. + try: + from perseus.redaction import redact_text + redacted_data, _ = redact_text(payload, cfg) + body_dict["data"] = redacted_data + except Exception: + pass # redaction failure must not block webhook delivery body_json = json.dumps(body_dict) body_data = body_json.encode("utf-8")