Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 52 additions & 14 deletions perseus.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,14 @@ def _webhook_worker(url, ep, wh_cfg, q):
"version": version,
"data": payload
}
# #167: redact secrets from webhook payload before external delivery.
# Pre-1.0.6, directive args and output snippets in payload["data"]
# were sent verbatim to webhook endpoints, leaking secrets.
try:
redacted_data, _ = redact_text(payload, cfg)
body_dict["data"] = redacted_data
except Exception:
pass # redaction failure must not block webhook delivery
body_json = json.dumps(body_dict)
body_data = body_json.encode("utf-8")

Expand Down Expand Up @@ -1533,8 +1541,8 @@ def _reset_plugin_cache() -> None:
{"name": "aws_access_key_id", "pattern": r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b"},
# Slack bot/user/app/refresh tokens
{"name": "slack_token", "pattern": r"\bxox[abprso]-[A-Za-z0-9-]{10,}\b"},
# Generic bearer header value (Authorization: Bearer XXXX)
{"name": "bearer_header", "pattern": r"(?i)(authorization:\s*bearer\s+)[A-Za-z0-9._\-+/=]{16,}"},
# Generic bearer header value (Authorization: Bearer ***
{"name": "bearer_header", "pattern": r"(?i)(authorization:\s*bearer\s+)[A-Za-z0-9._\-+/=]{16,}", "_prefix_group": 1},
# JWT (three base64url segments). Conservative: require non-trivial first segment.
{"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"},
# PEM private key block (covers RSA, EC, OPENSSH, generic)
Expand All @@ -1552,6 +1560,9 @@ def _reset_plugin_cache() -> None:
{"name": "long_hex_secret",
"pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?",
"_anchor_group": 1},
# Atlassian API token: ATATT3... (Confluence/Jira personal access tokens)
# See: https://github.com/tcconnally/perseus/issues/142
{"name": "atlassian_api_token", "pattern": r"\bATATT3[A-Za-z0-9+/=_-]{40,}\b"},
# HuggingFace: hf_... (read/write tokens)
{"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"},
# Google Cloud API key: AIza...
Expand Down Expand Up @@ -1622,11 +1633,13 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]:
# behavior: group(1) (if present) is treated as a leading prefix to
# preserve and the rest of the match is replaced.
anchor_group = rule.get("_anchor_group")
prefix_group = rule.get("_prefix_group")
compiled.append({
"name": name,
"regex": regex,
"replacement": str(replacement),
"anchor_group": anchor_group,
"prefix_group": prefix_group,
})
return compiled

Expand Down Expand Up @@ -1683,8 +1696,13 @@ def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")):
rel_start = span_start - match.start()
rel_end = span_end - match.start()
return full[:rel_start] + _repl + full[rel_end:]
if match.lastindex:
return match.group(1) + _repl
# #141: prefix-preservation only for rules that explicitly
# declare _prefix_group (e.g. bearer_header). User-supplied
# patterns with accidental capture groups would silently
# truncate data under the old `match.lastindex` heuristic.
_pg = rule.get("prefix_group")
if _pg is not None and match.lastindex and match.lastindex >= _pg:
return match.group(_pg) + _repl
return _repl
out, n = regex.subn(_sub, out)
if n:
Expand Down Expand Up @@ -1734,7 +1752,6 @@ def redact_value(value, cfg: dict) -> tuple[object, dict]:
counts[name] = counts.get(name, 0) + count
return out, {"enabled": enabled, "total": total, "counts": counts, "rules_active": rules_active}
return value, {"enabled": True, "total": 0, "counts": {}, "rules_active": 0}

# Callers can disable via `audit.enabled = false`.
_VALIDATOR_CACHE: dict[str, Callable] = {}

Expand Down Expand Up @@ -3605,6 +3622,17 @@ def resolve_query(args_str: str, cfg: dict, workspace: "Path | None" = None) ->
fallback = _unescape_fallback(fallback)
raw = (raw[:fb_match.start()] + raw[fb_match.end():]).rstrip()

# #138: strip timeout=N modifier BEFORE command extraction to prevent
# it from leaking into the executed shell command.

# Extract timeout=N modifier (per-directive override, default 30s)
timeout = int(cfg["render"].get("query_timeout_s", 30))
tm_match = re.search(r'\s+timeout=(\d+)(?:\s|$)', raw)
if tm_match:
timeout = int(tm_match.group(1))
raw = (raw[:tm_match.start()] + raw[tm_match.end():]).rstrip()


cmd_match = re.match(r'^"((?:[^"\\]|\\.)*)"', raw) # double-quoted
if not cmd_match:
cmd_match = re.match(r"^'((?:[^'\\]|\\.)*)'", raw) # single-quoted
Expand All @@ -3626,13 +3654,6 @@ def resolve_query(args_str: str, cfg: dict, workspace: "Path | None" = None) ->
command=cmd[:500],
shell=shell)

# Extract timeout=N modifier (per-directive override, default 30s)
timeout = int(cfg["render"].get("query_timeout_s", 30))
tm_match = re.search(r'\s+timeout=(\d+)(?:\s|$)', raw)
if tm_match:
timeout = int(tm_match.group(1))
raw = (raw[:tm_match.start()] + raw[tm_match.end():]).rstrip()

try:
# #139: when invoked under MCP's _call_tool timeout wrapper, the
# wrapper needs to kill this subprocess (and any descendants) if
Expand Down Expand Up @@ -3690,7 +3711,7 @@ class _Result:
return fallback
# #137: redact secrets out of `cmd` and `stderr` before interpolating
# them into render output. Without this, a command like
# `@query "curl -H 'Authorization: Bearer ghp_…'"` leaks the bearer
# `@query "curl -H 'Authorization: Bearer *** leaks the bearer
# token in the exit-nonzero header. Render-time redaction only runs
# later in the pipeline and only on the final assembled output, but
# by then this string has been logged elsewhere.
Expand Down Expand Up @@ -4546,7 +4567,6 @@ def format_prefetch_human(result: dict) -> str:
trigger = entry.get("trigger") or "no-trigger"
lines.append(f"- {entry['status']}: {entry['rule']} {trigger} -> {target}{reason}")
return "\n".join(lines)

# ──────────────────────────────── @agent ──────────────────────────────────────

def resolve_agent(args_str: str, cfg: dict, workspace: Path | None = None) -> str:
Expand Down Expand Up @@ -9758,13 +9778,31 @@ def _load_narrative(path: Path) -> tuple[dict, str]:
return fm, body



def _safe_fsync(path):
"""Fsync file + parent directory for durability (#140)."""
try:
with open(path, "rb") as f:
os.fsync(f.fileno())
except OSError:
pass
try:
fd = os.open(str(path.parent), os.O_RDONLY)
os.fsync(fd)
os.close(fd)
except OSError:
pass

def _save_narrative(path: Path, frontmatter: dict, body: str) -> None:
"""Atomically write the narrative file (temp + rename)."""
path.parent.mkdir(parents=True, exist_ok=True)
fm_yaml = yaml.safe_dump(frontmatter, default_flow_style=False, allow_unicode=True, sort_keys=False).strip()
payload = f"---\n{fm_yaml}\n---\n\n{body.rstrip()}\n"
tmp = path.with_suffix(path.suffix + ".tmp")
tmp.write_text(payload, encoding="utf-8")
# #140: fsync temp file + parent directory before atomic rename to
# prevent narrative loss on system crash / power loss.
_safe_fsync(tmp)
os.replace(tmp, path)


Expand Down
23 changes: 13 additions & 10 deletions src/perseus/directives/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,17 @@ def resolve_query(args_str: str, cfg: dict, workspace: "Path | None" = None) ->
fallback = _unescape_fallback(fallback)
raw = (raw[:fb_match.start()] + raw[fb_match.end():]).rstrip()

# #138: strip timeout=N modifier BEFORE command extraction to prevent
# it from leaking into the executed shell command.

# Extract timeout=N modifier (per-directive override, default 30s)
timeout = int(cfg["render"].get("query_timeout_s", 30))
tm_match = re.search(r'\s+timeout=(\d+)(?:\s|$)', raw)
if tm_match:
timeout = int(tm_match.group(1))
raw = (raw[:tm_match.start()] + raw[tm_match.end():]).rstrip()


cmd_match = re.match(r'^"((?:[^"\\]|\\.)*)"', raw) # double-quoted
if not cmd_match:
cmd_match = re.match(r"^'((?:[^'\\]|\\.)*)'", raw) # single-quoted
Expand All @@ -193,13 +204,6 @@ def resolve_query(args_str: str, cfg: dict, workspace: "Path | None" = None) ->
command=cmd[:500],
shell=shell)

# Extract timeout=N modifier (per-directive override, default 30s)
timeout = int(cfg["render"].get("query_timeout_s", 30))
tm_match = re.search(r'\s+timeout=(\d+)(?:\s|$)', raw)
if tm_match:
timeout = int(tm_match.group(1))
raw = (raw[:tm_match.start()] + raw[tm_match.end():]).rstrip()

try:
# #139: when invoked under MCP's _call_tool timeout wrapper, the
# wrapper needs to kill this subprocess (and any descendants) if
Expand Down Expand Up @@ -257,7 +261,7 @@ class _Result:
return fallback
# #137: redact secrets out of `cmd` and `stderr` before interpolating
# them into render output. Without this, a command like
# `@query "curl -H 'Authorization: Bearer ghp_…'"` leaks the bearer
# `@query "curl -H 'Authorization: Bearer *** leaks the bearer
# token in the exit-nonzero header. Render-time redaction only runs
# later in the pipeline and only on the final assembled output, but
# by then this string has been logged elsewhere.
Expand Down Expand Up @@ -1112,5 +1116,4 @@ def format_prefetch_human(result: dict) -> str:
reason = f" ({entry['reason']})" if entry.get("reason") else ""
trigger = entry.get("trigger") or "no-trigger"
lines.append(f"- {entry['status']}: {entry['rule']} {trigger} -> {target}{reason}")
return "\n".join(lines)

return "\n".join(lines)
18 changes: 18 additions & 0 deletions src/perseus/mneme_narrative.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,13 +202,31 @@ def _load_narrative(path: Path) -> tuple[dict, str]:
return fm, body



def _safe_fsync(path):
"""Fsync file + parent directory for durability (#140)."""
try:
with open(path, "rb") as f:
os.fsync(f.fileno())
except OSError:
pass
try:
fd = os.open(str(path.parent), os.O_RDONLY)
os.fsync(fd)
os.close(fd)
except OSError:
pass

def _save_narrative(path: Path, frontmatter: dict, body: str) -> None:
"""Atomically write the narrative file (temp + rename)."""
path.parent.mkdir(parents=True, exist_ok=True)
fm_yaml = yaml.safe_dump(frontmatter, default_flow_style=False, allow_unicode=True, sort_keys=False).strip()
payload = f"---\n{fm_yaml}\n---\n\n{body.rstrip()}\n"
tmp = path.with_suffix(path.suffix + ".tmp")
tmp.write_text(payload, encoding="utf-8")
# #140: fsync temp file + parent directory before atomic rename to
# prevent narrative loss on system crash / power loss.
_safe_fsync(tmp)
os.replace(tmp, path)


Expand Down
21 changes: 15 additions & 6 deletions src/perseus/redaction.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@
{"name": "aws_access_key_id", "pattern": r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b"},
# Slack bot/user/app/refresh tokens
{"name": "slack_token", "pattern": r"\bxox[abprso]-[A-Za-z0-9-]{10,}\b"},
# Generic bearer header value (Authorization: Bearer XXXX)
{"name": "bearer_header", "pattern": r"(?i)(authorization:\s*bearer\s+)[A-Za-z0-9._\-+/=]{16,}"},
# Generic bearer header value (Authorization: Bearer ***
{"name": "bearer_header", "pattern": r"(?i)(authorization:\s*bearer\s+)[A-Za-z0-9._\-+/=]{16,}", "_prefix_group": 1},
# JWT (three base64url segments). Conservative: require non-trivial first segment.
{"name": "jwt", "pattern": r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"},
# PEM private key block (covers RSA, EC, OPENSSH, generic)
Expand All @@ -57,6 +57,9 @@
{"name": "long_hex_secret",
"pattern": r"(?i)(?:secret|token|key|password|passwd|api[_-]?key|auth(?:orization)?)\s*[:=]\s*[\"']?([a-fA-F0-9]{40,})[\"']?",
"_anchor_group": 1},
# Atlassian API token: ATATT3... (Confluence/Jira personal access tokens)
# See: https://github.com/tcconnally/perseus/issues/142
{"name": "atlassian_api_token", "pattern": r"\bATATT3[A-Za-z0-9+/=_-]{40,}\b"},
# HuggingFace: hf_... (read/write tokens)
{"name": "huggingface_token", "pattern": r"\bhf_[A-Za-z0-9]{30,}\b"},
# Google Cloud API key: AIza...
Expand Down Expand Up @@ -127,11 +130,13 @@ def _compile_redaction_rules(cfg: dict) -> list[dict]:
# behavior: group(1) (if present) is treated as a leading prefix to
# preserve and the rest of the match is replaced.
anchor_group = rule.get("_anchor_group")
prefix_group = rule.get("_prefix_group")
compiled.append({
"name": name,
"regex": regex,
"replacement": str(replacement),
"anchor_group": anchor_group,
"prefix_group": prefix_group,
})
return compiled

Expand Down Expand Up @@ -188,8 +193,13 @@ def _sub(match, _repl=rule["replacement"], _ag=rule.get("anchor_group")):
rel_start = span_start - match.start()
rel_end = span_end - match.start()
return full[:rel_start] + _repl + full[rel_end:]
if match.lastindex:
return match.group(1) + _repl
# #141: prefix-preservation only for rules that explicitly
# declare _prefix_group (e.g. bearer_header). User-supplied
# patterns with accidental capture groups would silently
# truncate data under the old `match.lastindex` heuristic.
_pg = rule.get("prefix_group")
if _pg is not None and match.lastindex and match.lastindex >= _pg:
return match.group(_pg) + _repl
return _repl
out, n = regex.subn(_sub, out)
if n:
Expand Down Expand Up @@ -238,5 +248,4 @@ def redact_value(value, cfg: dict) -> tuple[object, dict]:
for name, count in rep.get("counts", {}).items():
counts[name] = counts.get(name, 0) + count
return out, {"enabled": enabled, "total": total, "counts": counts, "rules_active": rules_active}
return value, {"enabled": True, "total": 0, "counts": {}, "rules_active": 0}

return value, {"enabled": True, "total": 0, "counts": {}, "rules_active": 0}
9 changes: 9 additions & 0 deletions src/perseus/webhooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,15 @@ def _webhook_worker(url, ep, wh_cfg, q):
"version": version,
"data": payload
}
# #167: redact secrets from webhook payload before external delivery.
# Pre-1.0.6, directive args and output snippets in payload["data"]
# were sent verbatim to webhook endpoints, leaking secrets.
try:
from perseus.redaction import redact_text
redacted_data, _ = redact_text(payload, cfg)
body_dict["data"] = redacted_data
except Exception:
pass # redaction failure must not block webhook delivery
body_json = json.dumps(body_dict)
body_data = body_json.encode("utf-8")

Expand Down
Loading