tcconnally · tcconnally · Jun 4, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 2, 2026
diff --git a/benchmark/gauntlet/gauntlet_adversarial.py b/benchmark/gauntlet/gauntlet_adversarial.py
@@ -98,6 +98,7 @@ def run_scenario(
     env = os.environ.copy()
     perseus_home.mkdir(parents=True, exist_ok=True)
     env["PERSEUS_HOME"] = str(perseus_home)
+    env["PERSEUS_ALLOW_DANGEROUS"] = "1"
 
     t0 = time.time()
     last_check = t0
@@ -501,6 +502,7 @@ def scenario_a7_signal_storm(
     home = Path("/tmp/perseus-gauntlet/signal-storm")
     home.mkdir(parents=True, exist_ok=True)
     env["PERSEUS_HOME"] = str(home)
+    env["PERSEUS_ALLOW_DANGEROUS"] = "1"
 
     t0 = time.time()
     signals_sent = 0
@@ -631,6 +633,7 @@ def scenario_a9_fork_bomb_defense(
     home = Path("/tmp/perseus-gauntlet/fork-bomb")
     home.mkdir(parents=True, exist_ok=True)
     env["PERSEUS_HOME"] = str(home)
+    env["PERSEUS_ALLOW_DANGEROUS"] = "1"
 
     # Pre-create context file so run_scenario doesn't need to write
     ctx_file = home / "_adversarial_ctx.md"
@@ -706,9 +709,10 @@ def scenario_a10_symlink_race(
     target = race_dir / "target"
     target.write_text("sensitive data")
 
-    # Create context.md BEFORE renders
+    # Create context.md BEFORE renders. The @read keeps this scenario tied to
+    # the symlink chain instead of just rendering an inert prompt.
     ctx_file = race_dir / "context.md"
-    ctx_file.write_text("@perseus v0.8\n@prompt symlink race\n@query \"echo test\" @cache ttl=300\n")
+    ctx_file.write_text("@perseus v0.8\n@prompt symlink race\n@read \"link_0\"\n")
 
     chain = []
     for i in range(20):
@@ -721,17 +725,16 @@ def scenario_a10_symlink_race(
 
     result["setup"] = f"created {len(chain)} symlink chain entries"
 
-    # Create context.md BEFORE the render loop so renders have a file to read
-    (race_dir / "context.md").write_text("@perseus v0.8\n@prompt symlink race\n")
-
     # Run renders while modifying symlinks
     perseus = perseus_executable()
     env = os.environ.copy()
     env["PERSEUS_HOME"] = str(perseus_home)
+    env["PERSEUS_ALLOW_DANGEROUS"] = "1"
 
     t0 = time.time()
     renders_ok = 0
     renders_failed = 0
+    escape_errors: list[str] = []
 
     while time.time() - t0 < duration_s:
         if _kill_switch_triggered():
@@ -741,7 +744,7 @@ def scenario_a10_symlink_race(
         for link in chain:
             try:
                 link.unlink()
-                link.symlink_to(race_dir / ".." / ".." / "etc" / "passwd")
+                link.symlink_to(Path("/etc/passwd"))
                 time.sleep(0.001)
                 link.unlink()
                 link.symlink_to(target)
@@ -756,6 +759,9 @@ def scenario_a10_symlink_race(
             )
             if r.returncode == 0:
                 renders_ok += 1
+                if "root:" in r.stdout:
+                    renders_failed += 1
+                    escape_errors.append("read through symlink resolved outside workspace")
             else:
                 renders_failed += 1
         except Exception:
@@ -765,6 +771,7 @@ def scenario_a10_symlink_race(
         "duration_s": time.time() - t0,
         "renders_ok": renders_ok,
         "renders_failed": renders_failed,
+        "errors": escape_errors,
     }
 
     # Cleanup

diff --git a/benchmark/gauntlet/gauntlet_lib.py b/benchmark/gauntlet/gauntlet_lib.py
@@ -272,14 +272,28 @@ def evaluate_all(
 
             # Detect environment failures from error messages
             category = gate.get("category", "engine")
-            if not passed and isinstance(observed, str):
+            if isinstance(observed, str):
                 env_patterns = [
                     "PermissionError", "permission denied", "GOOGLE_API_KEY",
                     "API key", "api_key", "env var",
                 ]
                 if any(p.lower() in observed.lower() for p in env_patterns):
                     category = "environment"
 
+            # Treat explicit skips as skips, not passes. A skipped hard gate
+            # means the run is incomplete and cannot be certified.
+            if isinstance(observed, str) and observed.startswith("skipped:"):
+                results.append({
+                    "name": gate["name"],
+                    "pass": gate["severity"] != "hard",
+                    "observed": observed,
+                    "threshold": gate["threshold"],
+                    "severity": gate["severity"],
+                    "category": category,
+                    "skipped": True,
+                })
+                continue
+
             # Treat "no data" as skipped/fail based on severity
             if isinstance(observed, str) and observed == "no data":
                 if gate["severity"] == "hard":
@@ -318,32 +332,90 @@ def evaluate_all(
     @staticmethod
     def make_report(gate_results: list[dict]) -> dict:
         total = len(gate_results)
-        passed = sum(1 for g in gate_results if g["pass"])
+        active = [g for g in gate_results if not g.get("skipped")]
+        skipped = [g for g in gate_results if g.get("skipped")]
+        passed = sum(1 for g in active if g["pass"])
         hard_failed = [
-            g for g in gate_results if not g["pass"] and g["severity"] == "hard"
+            g for g in active if not g["pass"] and g["severity"] == "hard"
+        ]
+        hard_skipped = [
+            g for g in skipped if g["severity"] == "hard"
         ]
         # Separate by category
         by_category = {}
         for g in gate_results:
             cat = g.get("category", "engine")
             if cat not in by_category:
-                by_category[cat] = {"passed": 0, "failed": 0, "total": 0}
+                by_category[cat] = {"passed": 0, "failed": 0, "skipped": 0, "total": 0}
             by_category[cat]["total"] += 1
-            if g["pass"]:
+            if g.get("skipped"):
+                by_category[cat]["skipped"] += 1
+            elif g["pass"]:
                 by_category[cat]["passed"] += 1
             else:
                 by_category[cat]["failed"] += 1
 
         return {
             "total": total,
+            "active_total": len(active),
             "passed": passed,
-            "failed": [g for g in gate_results if not g["pass"]],
+            "skipped": skipped,
+            "skipped_count": len(skipped),
+            "hard_skipped": hard_skipped,
+            "failed": [g for g in active if not g["pass"]],
             "hard_failed": hard_failed,
-            "pass": len(hard_failed) == 0,
+            "pass": len(hard_failed) == 0 and len(hard_skipped) == 0,
             "by_category": by_category,
         }
 
 
+def phase_budget_overruns(phase_results: dict[str, Any] | list[dict[str, Any]]) -> list[dict]:
+    """Collect one summary dict per phase whose ``within_time_budget`` is exactly False."""
+    # A dict contributes its values as phase records; anything else is iterated directly.
+    records = phase_results.values() if isinstance(phase_results, dict) else phase_results
+
+    found: list[dict] = []
+    for record in records:
+        # Skip non-dict entries and phases not explicitly flagged as over budget.
+        if not isinstance(record, dict) or record.get("within_time_budget") is not False:
+            continue
+
+        spent = record.get("duration_s")
+        allowed = record.get("max_duration_s")
+        spent_is_num = isinstance(spent, (int, float))
+        allowed_is_num = isinstance(allowed, (int, float))
+        entry = {
+            "phase": record.get("phase", "?"),
+            "name": record.get("name", ""),
+            "duration_s": round(spent, 3) if spent_is_num else spent,
+            "max_duration_s": round(allowed, 3) if allowed_is_num else allowed,
+        }
+        if spent_is_num and allowed_is_num:
+            # Only a numeric pair gets an excess figure, clamped at zero.
+            entry["over_by_s"] = round(max(0.0, spent - allowed), 3)
+        found.append(entry)
+    return found
+
+
+def budget_gate_threshold(phase_results: dict[str, Any] | list[dict[str, Any]]) -> tuple[bool, Any]:
+    """Gate check: pass only when ``phase_budget_overruns`` reports nothing for *phase_results*."""
+    exceeded = phase_budget_overruns(phase_results)
+    if not exceeded:
+        return True, "all executed phases within time budget"
+    return False, exceeded  # the overrun records themselves are the observed value
+
+
+def rss_growth_threshold(phase_results: dict[str, Any]) -> tuple[bool, Any]:
+    """Gate threshold: Phase 10 must have a real RSS signal and stay <= 5%. Returns (passed, observed)."""
+    phase = phase_results.get("phase_10") if isinstance(phase_results, dict) else None
+    if not isinstance(phase, dict) or not phase.get("rss_measurement_available", False):
+        return False, "no data"  # missing/non-dict phase record, or no usable RSS measurement
+    growth = phase.get("rss_growth_pct")
+    if not isinstance(growth, (int, float)):
+        return False, "no data" if growth is None else growth  # echo a malformed value as observed
+    return growth <= 5.0, growth
+
+
 # ─── NFS Probe
 
 class TelemetrySink:
@@ -477,32 +549,43 @@ def generate_final_report(
 ) -> str:
     """Generate a human-readable gauntlet report in markdown."""
     gate_report = GateRunner.make_report(gate_results)
+    is_smoke = bool(meta and meta.get("duration") == "smoke")
+    run_pass = len(gate_report.get("failed", [])) == 0
+    overall_pass = run_pass if is_smoke else gate_report["pass"]
 
     lines: list[str] = [
         f"# Perseus Gauntlet — Final Report",
         f"",
-        f"**Version:** {GAUNTLET_VERSION}  ",
-        f"**Date:** {timestamp_iso()}  ",
+        f"**Version:** {GAUNTLET_VERSION}",
+        f"**Date:** {timestamp_iso()}",
         f"",
         f"## Summary",
         f"",
         f"| Metric | Result |",
         f"|--------|--------|",
         f"| Phases | {len(phase_results)} |",
-        f"| Gates passed | {gate_report['passed']}/{gate_report['total']} |",
-        f"| Overall | {'**PASS**  ' if gate_report['pass'] else '**FAIL**  '} |",
+        f"| Gates passed | {gate_report['passed']}/{gate_report.get('active_total', gate_report['total'])} active |",
+        f"| Gates skipped | {gate_report.get('skipped_count', 0)} |",
+        f"| Overall | {'**PASS**' if overall_pass else '**FAIL**'} |",
         f"",
     ]
 
     if meta:
         lines.extend(
             [
-                f"**Host:** {meta.get('hostname', 'unknown')}  ",
-                f"**Perseus:** {meta.get('perseus_version', '?')}  ",
-                f"**Developers per node:** {meta.get('developers_per_node', '?')}  ",
-                f"**Nodes:** {meta.get('nodes', '?')}  ",
+                f"**Host:** {meta.get('hostname', 'unknown')}",
+                f"**Perseus:** {meta.get('perseus_version', '?')}",
+                f"**Developers per node:** {meta.get('developers_per_node', '?')}",
+                f"**Nodes:** {meta.get('nodes', '?')}",
             ]
         )
+        if is_smoke:
+            lines.extend(
+                [
+                    f"**Smoke run:** {'PASS' if run_pass else 'FAIL'}",
+                    f"**Full certification:** {'PASS' if gate_report['pass'] else 'not evaluated'}",
+                ]
+            )
 
     # Phase results table
     lines.extend(
@@ -528,7 +611,7 @@ def generate_final_report(
     lines.extend(
         [
             f"",
-            f"## Gate Results ({gate_report['passed']}/{gate_report['total']} passed)",
+            f"## Gate Results ({gate_report['passed']}/{gate_report.get('active_total', gate_report['total'])} active passed; {gate_report.get('skipped_count', 0)} skipped)",
             f"",
             f"| Gate | Pass | Observed | Threshold | Severity |",
             f"|------|------|----------|-----------|----------|",
@@ -538,7 +621,7 @@ def generate_final_report(
         obs = g.get("observed", "")
         obs_str = json.dumps(obs) if not isinstance(obs, str) else str(obs)[:80]
         lines.append(
-            f"| {g['name']} | {'✅' if g['pass'] else '❌'} | {obs_str} | "
+            f"| {g['name']} | {'SKIP' if g.get('skipped') else ('✅' if g['pass'] else '❌')} | {obs_str} | "
             f"{g.get('threshold', '')} | {g['severity']} |"
         )
 
@@ -570,15 +653,16 @@ def _compute_gauntlet_score(
     if gate_report["total"] == 0:
         return 0.0
 
-    # Exclude skipped gates from scoring (they have no data to evaluate)
+    # Exclude skipped gates from active pass-rate scoring. Skipped hard gates
+    # still prevent certification and the certification bonus.
     skipped = set()
     if gate_results:
         skipped = {g["name"] for g in gate_results if g.get("skipped")}
 
-    active_total = gate_report["total"] - len(skipped)
+    active_total = gate_report.get("active_total", gate_report["total"] - len(skipped))
     if active_total <= 0:
-        # All gates skipped — score 100 (nothing to evaluate, no failures)
-        return 100.0
+        # All gates skipped: no evidence, no score.
+        return 0.0
 
     # Count passed among non-skipped gates only.
     # gate_report["passed"] includes skipped gates (they have pass=True),
@@ -600,7 +684,7 @@ def _phase_completed(pr: dict) -> bool:
         if pr.get("failures", 0) > 0:
             return False
         if pr.get("status") == "skipped":
-            return True
+            return False
         if "overall_pass" in pr:
             return bool(pr.get("overall_pass"))
         return True
@@ -609,7 +693,7 @@ def _phase_completed(pr: dict) -> bool:
     phase_bonus = (completed / max(len(phase_results), 1)) * 20.0
     # Certification bonus: all active hard gates pass. Without this, a perfect
     # run tops out at 90 despite the score being documented as 0-100.
-    hard_pass_bonus = 0.0 if gate_report.get("hard_failed") else 10.0
+    hard_pass_bonus = 0.0 if (gate_report.get("hard_failed") or gate_report.get("hard_skipped")) else 10.0
     # Hard-fail penalty: -10 per failed hard gate, excluding skipped gates
     penalty = len([
         g for g in gate_report.get("hard_failed", [])

diff --git a/benchmark/gauntlet/gauntlet_node.py b/benchmark/gauntlet/gauntlet_node.py
@@ -65,6 +65,7 @@ def render_profile(
     home = WARM_HOME if cache_state == "warm" else COLD_HOME
     env = os.environ.copy()
     env["PERSEUS_HOME"] = str(home)
+    env["PERSEUS_ALLOW_DANGEROUS"] = "1"
     env["PERSEUS_BENCH"] = "1"  # enables BENCH| line on stderr for cache_hits/cache_misses
     if env_extra:
         env.update(env_extra)
@@ -305,6 +306,7 @@ def phase_checkpoint_relay(
     perseus = perseus_executable()
     env = os.environ.copy()
     env["PERSEUS_HOME"] = str(WARM_HOME)
+    env["PERSEUS_ALLOW_DANGEROUS"] = "1"
 
     for i in range(min(writes_per_node, 2000)):
         t0 = time.time()
@@ -449,13 +451,17 @@ def phase_sustained_torture(
             "cv": statistics.stdev(sorted_times) / mean if n >= 2 and mean > 0 else 0.0,
             "total_s": time.time() - t_start,
         })
+    valid_rss_samples = [
+        sample for sample in rss_samples
+        if isinstance(sample, int) and sample > 0
+    ]
     agg["rss_samples"] = rss_samples
     agg["rss_growth_pct"] = (
-        ((rss_samples[-1] - rss_samples[0]) / rss_samples[0] * 100)
-        if len(rss_samples) >= 2 and rss_samples[0] > 0
+        ((valid_rss_samples[-1] - valid_rss_samples[0]) / valid_rss_samples[0] * 100)
+        if len(valid_rss_samples) >= 2
         else None  # None signals "unsupported platform / insufficient samples" — not zero
     )
-    agg["rss_measurement_available"] = len(rss_samples) >= 2
+    agg["rss_measurement_available"] = len(valid_rss_samples) >= 2
     return agg