Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions benchmark/gauntlet/gauntlet_adversarial.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ def run_scenario(
env = os.environ.copy()
perseus_home.mkdir(parents=True, exist_ok=True)
env["PERSEUS_HOME"] = str(perseus_home)
env["PERSEUS_ALLOW_DANGEROUS"] = "1"

t0 = time.time()
last_check = t0
Expand Down Expand Up @@ -501,6 +502,7 @@ def scenario_a7_signal_storm(
home = Path("/tmp/perseus-gauntlet/signal-storm")
home.mkdir(parents=True, exist_ok=True)
env["PERSEUS_HOME"] = str(home)
env["PERSEUS_ALLOW_DANGEROUS"] = "1"

t0 = time.time()
signals_sent = 0
Expand Down Expand Up @@ -631,6 +633,7 @@ def scenario_a9_fork_bomb_defense(
home = Path("/tmp/perseus-gauntlet/fork-bomb")
home.mkdir(parents=True, exist_ok=True)
env["PERSEUS_HOME"] = str(home)
env["PERSEUS_ALLOW_DANGEROUS"] = "1"

# Pre-create context file so run_scenario doesn't need to write
ctx_file = home / "_adversarial_ctx.md"
Expand Down Expand Up @@ -706,9 +709,10 @@ def scenario_a10_symlink_race(
target = race_dir / "target"
target.write_text("sensitive data")

# Create context.md BEFORE renders
# Create context.md BEFORE renders. The @read keeps this scenario tied to
# the symlink chain instead of just rendering an inert prompt.
ctx_file = race_dir / "context.md"
ctx_file.write_text("@perseus v0.8\n@prompt symlink race\n@query \"echo test\" @cache ttl=300\n")
ctx_file.write_text("@perseus v0.8\n@prompt symlink race\n@read \"link_0\"\n")

chain = []
for i in range(20):
Expand All @@ -721,17 +725,16 @@ def scenario_a10_symlink_race(

result["setup"] = f"created {len(chain)} symlink chain entries"

# Create context.md BEFORE the render loop so renders have a file to read
(race_dir / "context.md").write_text("@perseus v0.8\n@prompt symlink race\n")

# Run renders while modifying symlinks
perseus = perseus_executable()
env = os.environ.copy()
env["PERSEUS_HOME"] = str(perseus_home)
env["PERSEUS_ALLOW_DANGEROUS"] = "1"

t0 = time.time()
renders_ok = 0
renders_failed = 0
escape_errors: list[str] = []

while time.time() - t0 < duration_s:
if _kill_switch_triggered():
Expand All @@ -741,7 +744,7 @@ def scenario_a10_symlink_race(
for link in chain:
try:
link.unlink()
link.symlink_to(race_dir / ".." / ".." / "etc" / "passwd")
link.symlink_to(Path("/etc/passwd"))
time.sleep(0.001)
link.unlink()
link.symlink_to(target)
Expand All @@ -756,6 +759,9 @@ def scenario_a10_symlink_race(
)
if r.returncode == 0:
renders_ok += 1
if "root:" in r.stdout:
renders_failed += 1
escape_errors.append("read through symlink resolved outside workspace")
else:
renders_failed += 1
except Exception:
Expand All @@ -765,6 +771,7 @@ def scenario_a10_symlink_race(
"duration_s": time.time() - t0,
"renders_ok": renders_ok,
"renders_failed": renders_failed,
"errors": escape_errors,
}

# Cleanup
Expand Down
130 changes: 107 additions & 23 deletions benchmark/gauntlet/gauntlet_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,14 +272,28 @@ def evaluate_all(

# Detect environment failures from error messages
category = gate.get("category", "engine")
if not passed and isinstance(observed, str):
if isinstance(observed, str):
env_patterns = [
"PermissionError", "permission denied", "GOOGLE_API_KEY",
"API key", "api_key", "env var",
]
if any(p.lower() in observed.lower() for p in env_patterns):
category = "environment"

# Treat explicit skips as skips, not passes. A skipped hard gate
# means the run is incomplete and cannot be certified.
if isinstance(observed, str) and observed.startswith("skipped:"):
results.append({
"name": gate["name"],
"pass": gate["severity"] != "hard",
"observed": observed,
"threshold": gate["threshold"],
"severity": gate["severity"],
"category": category,
"skipped": True,
})
continue

# Treat "no data" as skipped/fail based on severity
if isinstance(observed, str) and observed == "no data":
if gate["severity"] == "hard":
Expand Down Expand Up @@ -318,32 +332,90 @@ def evaluate_all(
@staticmethod
def make_report(gate_results: list[dict]) -> dict:
    """Summarise evaluated gates into a single report dict.

    Gates flagged ``skipped`` are reported separately from active gates:
    they are neither counted as passed nor as failed. A skipped gate with
    ``severity == "hard"`` still blocks the overall ``pass`` flag, because
    the report only passes when there are no hard failures and no hard
    skips.

    Args:
        gate_results: One dict per gate. Each must carry ``pass`` and
            ``severity``; ``skipped`` and ``category`` are optional
            (``category`` defaults to ``"engine"``).

    Returns:
        A dict with the keys ``total``, ``active_total``, ``passed``,
        ``skipped``, ``skipped_count``, ``hard_skipped``, ``failed``,
        ``hard_failed``, ``pass`` and ``by_category``.
    """
    active = [g for g in gate_results if not g.get("skipped")]
    skipped = [g for g in gate_results if g.get("skipped")]

    passed = sum(1 for g in active if g["pass"])
    failed = [g for g in active if not g["pass"]]
    hard_failed = [g for g in failed if g["severity"] == "hard"]
    hard_skipped = [g for g in skipped if g["severity"] == "hard"]

    # Per-category tallies; a skipped gate counts only towards "skipped"
    # and "total", regardless of its "pass" value.
    by_category: dict[str, dict[str, int]] = {}
    for g in gate_results:
        counts = by_category.setdefault(
            g.get("category", "engine"),
            {"passed": 0, "failed": 0, "skipped": 0, "total": 0},
        )
        counts["total"] += 1
        if g.get("skipped"):
            counts["skipped"] += 1
        elif g["pass"]:
            counts["passed"] += 1
        else:
            counts["failed"] += 1

    return {
        "total": len(gate_results),
        "active_total": len(active),
        "passed": passed,
        "skipped": skipped,
        "skipped_count": len(skipped),
        "hard_skipped": hard_skipped,
        "failed": failed,
        "hard_failed": hard_failed,
        "pass": not hard_failed and not hard_skipped,
        "by_category": by_category,
    }


def phase_budget_overruns(phase_results: dict[str, Any] | list[dict[str, Any]]) -> list[dict]:
"""Return phase time-budget overruns from a gauntlet result collection."""
if isinstance(phase_results, dict):
phases = phase_results.values()
else:
phases = phase_results

overruns: list[dict] = []
for phase in phases:
if not isinstance(phase, dict):
continue
if phase.get("within_time_budget") is not False:
continue

duration_s = phase.get("duration_s")
max_duration_s = phase.get("max_duration_s")
item = {
"phase": phase.get("phase", "?"),
"name": phase.get("name", ""),
"duration_s": round(duration_s, 3) if isinstance(duration_s, (int, float)) else duration_s,
"max_duration_s": round(max_duration_s, 3) if isinstance(max_duration_s, (int, float)) else max_duration_s,
}
if isinstance(duration_s, (int, float)) and isinstance(max_duration_s, (int, float)):
item["over_by_s"] = round(max(0.0, duration_s - max_duration_s), 3)
overruns.append(item)
return overruns


def budget_gate_threshold(phase_results: dict[str, Any] | list[dict[str, Any]]) -> tuple[bool, Any]:
    """Gate threshold: every executed phase must stay within its time budget.

    Args:
        phase_results: Phase result collection, passed unchanged to
            ``phase_budget_overruns``.

    Returns:
        ``(False, overruns)`` when at least one overrun is reported, where
        ``overruns`` is the list from ``phase_budget_overruns``; otherwise
        ``(True, "all executed phases within time budget")``.
    """
    overruns = phase_budget_overruns(phase_results)
    if overruns:
        return False, overruns
    return True, "all executed phases within time budget"


def rss_growth_threshold(phase_results: dict[str, Any]) -> tuple[bool, Any]:
    """Gate threshold: Phase 10 must have a real RSS signal and stay <= 5%.

    Args:
        phase_results: Mapping of phase results; only the ``"phase_10"``
            entry is read. Anything that is not a dict (for the collection
            or for the ``phase_10`` entry) is treated as an empty result.

    Returns:
        ``(passed, observed)``. ``observed`` is the string ``"no data"`` when
        ``rss_measurement_available`` is falsy or ``rss_growth_pct`` is
        ``None``; the raw value when ``rss_growth_pct`` is present but not a
        real number (reported as a failure); otherwise the numeric growth,
        which passes when it is at most 5.0.
    """
    phase = phase_results.get("phase_10") if isinstance(phase_results, dict) else None
    if not isinstance(phase, dict):
        # A missing, None or malformed phase entry carries no measurement.
        phase = {}

    if not phase.get("rss_measurement_available", False):
        return False, "no data"

    growth = phase.get("rss_growth_pct")
    if growth is None:
        return False, "no data"
    # bool is an int subclass but is never a valid growth percentage.
    if isinstance(growth, bool) or not isinstance(growth, (int, float)):
        return False, growth
    return growth <= 5.0, growth


# ─── NFS Probe

class TelemetrySink:
Expand Down Expand Up @@ -477,32 +549,43 @@ def generate_final_report(
) -> str:
"""Generate a human-readable gauntlet report in markdown."""
gate_report = GateRunner.make_report(gate_results)
is_smoke = bool(meta and meta.get("duration") == "smoke")
run_pass = len(gate_report.get("failed", [])) == 0
overall_pass = run_pass if is_smoke else gate_report["pass"]

lines: list[str] = [
f"# Perseus Gauntlet — Final Report",
f"",
f"**Version:** {GAUNTLET_VERSION} ",
f"**Date:** {timestamp_iso()} ",
f"**Version:** {GAUNTLET_VERSION}",
f"**Date:** {timestamp_iso()}",
f"",
f"## Summary",
f"",
f"| Metric | Result |",
f"|--------|--------|",
f"| Phases | {len(phase_results)} |",
f"| Gates passed | {gate_report['passed']}/{gate_report['total']} |",
f"| Overall | {'**PASS** ' if gate_report['pass'] else '**FAIL** '} |",
f"| Gates passed | {gate_report['passed']}/{gate_report.get('active_total', gate_report['total'])} active |",
f"| Gates skipped | {gate_report.get('skipped_count', 0)} |",
f"| Overall | {'**PASS**' if overall_pass else '**FAIL**'} |",
f"",
]

if meta:
lines.extend(
[
f"**Host:** {meta.get('hostname', 'unknown')} ",
f"**Perseus:** {meta.get('perseus_version', '?')} ",
f"**Developers per node:** {meta.get('developers_per_node', '?')} ",
f"**Nodes:** {meta.get('nodes', '?')} ",
f"**Host:** {meta.get('hostname', 'unknown')}",
f"**Perseus:** {meta.get('perseus_version', '?')}",
f"**Developers per node:** {meta.get('developers_per_node', '?')}",
f"**Nodes:** {meta.get('nodes', '?')}",
]
)
if is_smoke:
lines.extend(
[
f"**Smoke run:** {'PASS' if run_pass else 'FAIL'}",
f"**Full certification:** {'PASS' if gate_report['pass'] else 'not evaluated'}",
]
)

# Phase results table
lines.extend(
Expand All @@ -528,7 +611,7 @@ def generate_final_report(
lines.extend(
[
f"",
f"## Gate Results ({gate_report['passed']}/{gate_report['total']} passed)",
f"## Gate Results ({gate_report['passed']}/{gate_report.get('active_total', gate_report['total'])} active passed; {gate_report.get('skipped_count', 0)} skipped)",
f"",
f"| Gate | Pass | Observed | Threshold | Severity |",
f"|------|------|----------|-----------|----------|",
Expand All @@ -538,7 +621,7 @@ def generate_final_report(
obs = g.get("observed", "")
obs_str = json.dumps(obs) if not isinstance(obs, str) else str(obs)[:80]
lines.append(
f"| {g['name']} | {'✅' if g['pass'] else '❌'} | {obs_str} | "
f"| {g['name']} | {'SKIP' if g.get('skipped') else ('✅' if g['pass'] else '❌')} | {obs_str} | "
f"{g.get('threshold', '')} | {g['severity']} |"
)

Expand Down Expand Up @@ -570,15 +653,16 @@ def _compute_gauntlet_score(
if gate_report["total"] == 0:
return 0.0

# Exclude skipped gates from scoring (they have no data to evaluate)
# Exclude skipped gates from active pass-rate scoring. Skipped hard gates
# still prevent certification and the certification bonus.
skipped = set()
if gate_results:
skipped = {g["name"] for g in gate_results if g.get("skipped")}

active_total = gate_report["total"] - len(skipped)
active_total = gate_report.get("active_total", gate_report["total"] - len(skipped))
if active_total <= 0:
# All gates skipped — score 100 (nothing to evaluate, no failures)
return 100.0
# All gates skipped: no evidence, no score.
return 0.0

# Count passed among non-skipped gates only.
# gate_report["passed"] includes skipped gates (they have pass=True),
Expand All @@ -600,7 +684,7 @@ def _phase_completed(pr: dict) -> bool:
if pr.get("failures", 0) > 0:
return False
if pr.get("status") == "skipped":
return True
return False
if "overall_pass" in pr:
return bool(pr.get("overall_pass"))
return True
Expand All @@ -609,7 +693,7 @@ def _phase_completed(pr: dict) -> bool:
phase_bonus = (completed / max(len(phase_results), 1)) * 20.0
# Certification bonus: all active hard gates pass. Without this, a perfect
# run tops out at 90 despite the score being documented as 0-100.
hard_pass_bonus = 0.0 if gate_report.get("hard_failed") else 10.0
hard_pass_bonus = 0.0 if (gate_report.get("hard_failed") or gate_report.get("hard_skipped")) else 10.0
# Hard-fail penalty: -10 per failed hard gate, excluding skipped gates
penalty = len([
g for g in gate_report.get("hard_failed", [])
Expand Down
12 changes: 9 additions & 3 deletions benchmark/gauntlet/gauntlet_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def render_profile(
home = WARM_HOME if cache_state == "warm" else COLD_HOME
env = os.environ.copy()
env["PERSEUS_HOME"] = str(home)
env["PERSEUS_ALLOW_DANGEROUS"] = "1"
env["PERSEUS_BENCH"] = "1" # enables BENCH| line on stderr for cache_hits/cache_misses
if env_extra:
env.update(env_extra)
Expand Down Expand Up @@ -305,6 +306,7 @@ def phase_checkpoint_relay(
perseus = perseus_executable()
env = os.environ.copy()
env["PERSEUS_HOME"] = str(WARM_HOME)
env["PERSEUS_ALLOW_DANGEROUS"] = "1"

for i in range(min(writes_per_node, 2000)):
t0 = time.time()
Expand Down Expand Up @@ -449,13 +451,17 @@ def phase_sustained_torture(
"cv": statistics.stdev(sorted_times) / mean if n >= 2 and mean > 0 else 0.0,
"total_s": time.time() - t_start,
})
valid_rss_samples = [
sample for sample in rss_samples
if isinstance(sample, int) and sample > 0
]
agg["rss_samples"] = rss_samples
agg["rss_growth_pct"] = (
((rss_samples[-1] - rss_samples[0]) / rss_samples[0] * 100)
if len(rss_samples) >= 2 and rss_samples[0] > 0
((valid_rss_samples[-1] - valid_rss_samples[0]) / valid_rss_samples[0] * 100)
if len(valid_rss_samples) >= 2
else None # None signals "unsupported platform / insufficient samples" — not zero
)
agg["rss_measurement_available"] = len(rss_samples) >= 2
agg["rss_measurement_available"] = len(valid_rss_samples) >= 2
return agg


Expand Down
Loading
Loading