cuga-project · haroldship · Jun 8, 2026 · May 19, 2026 · May 28, 2026 · Jun 3, 2026
diff --git a/.bob/commands/cuga-create-pr.md b/.bob/commands/cuga-create-pr.md
@@ -81,6 +81,12 @@ just ci
 
 (`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice.
 
+Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers):
+
+```bash
+just test-smoke-e2e
+```
+
 #### Run the Command
 
 

diff --git a/.claude/commands/cuga-create-pr.md b/.claude/commands/cuga-create-pr.md
@@ -81,6 +81,12 @@ just ci
 
 (`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice.
 
+Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers):
+
+```bash
+just test-smoke-e2e
+```
+
 #### Run the Command
 
 

diff --git a/.cursor/commands/cuga-create-pr.md b/.cursor/commands/cuga-create-pr.md
@@ -81,6 +81,12 @@ just ci
 
 (`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice.
 
+Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers):
+
+```bash
+just test-smoke-e2e
+```
+
 #### Run the Command
 
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -45,9 +45,10 @@ or under the root `tests/` directory.
 ```bash
 just lint            # ruff check + ruff format --check
 just test-sanity     # ~5s
+just test-smoke-e2e  # live: AppWorld + M3 (needs API keys, AppWorld, M3 containers)
 just test-regression # ~7s
 just security        # bandit + pip-audit
-just ci              # all of the above
+just ci              # lint + test-regression + security (smoke is optional/manual, not included)
 ```
 
 CI runs the same `lint`, `test-regression`, and `security` checks on

diff --git a/README.md b/README.md
@@ -417,6 +417,16 @@ Bundle structure (comparison):
 
 Bundles are stored in `benchmarks/{benchmark}/evaluation_bundles/` and are git-ignored.
 
+#### Resilience: bundles and partial results on interrupt or crash
+
+M3's `eval.sh`/`compare.sh` salvage a best-effort bundle even when a run is interrupted (**Ctrl-C**) or crashes mid-flight, instead of silently losing everything collected so far:
+
+- **Partial result files**: if the evaluator is interrupted or hits an unexpected exception mid-run, it saves whatever task results were already collected to `benchmarks/m3/results/m3_config_partial_*.json` (or `m3_config_no_gt_partial_*.json` with `--no-ground-truth`); the `partial` marker in the filename distinguishes these from complete-run files.
+- **Bundle on exit**: `create_bundle`/`create_compare_bundle` are idempotent and run from both the success path and the script's `cleanup` trap (`trap cleanup EXIT INT TERM ERR`), so a bundle is produced exactly once whether the run finishes normally, is interrupted, or crashes — picking up the freshest result file written during that run.
+- **Comparisons exclude partials**: `compare.sh` filters `m3_config_partial_*`/`m3_config_no_gt_partial_*` out of its result-file collection so an interrupted run doesn't skew aggregate pass-rate/token totals in a comparison report.
+
+This salvage behavior is best-effort, and any loss is limited to the task in flight at the moment of interruption: completed tasks/domains are preserved, but progress within that task may be lost.
+
 ---
 
 ## 🔧 Configuration

diff --git a/benchmarks/appworld/eval_appworld_sdk.py b/benchmarks/appworld/eval_appworld_sdk.py
@@ -59,6 +59,7 @@
     save_evaluation_results,
     setup_agent_with_tools,
 )
+from benchmarks.helpers.sdk_eval_helpers import _react_steps_from_invoke_result
 
 tracker = ActivityTracker()
 var_manager = VariablesManager()
@@ -137,6 +138,7 @@ async def invoke_and_score_appworld(
     eval_dict: Dict[str, Any] = {}
     trace_id: Optional[str] = None
     _langfuse_metrics = None
+    invoke_result_holder: List[Any] = []
 
     async def run_invoke(invoke_config: Optional[dict] = None) -> None:
         nonlocal response, tool_calls, err, is_error, invoked
@@ -148,6 +150,8 @@ async def run_invoke(invoke_config: Optional[dict] = None) -> None:
                 track_tool_calls=track_tool_calls,
                 config=invoke_config or {},
             )
+            invoke_result_holder.clear()
+            invoke_result_holder.append(invoke_result)
             response = invoke_result.answer
             tool_calls = list(invoke_result.tool_calls or []) if track_tool_calls else []
             invoked = True
@@ -310,6 +314,14 @@ def complete_and_eval() -> None:
         result["llm_call_details"] = _langfuse_metrics.llm_call_details
         result["node_timings"] = _langfuse_metrics.node_timings
 
+    agent_steps = None
+    if invoke_result_holder:
+        agent_steps = _react_steps_from_invoke_result(invoke_result_holder[0])
+    if agent_steps is None:
+        agent_steps = len(tracker.steps) or len(tool_calls)
+    if agent_steps is not None:
+        result["steps"] = agent_steps
+
     return result
 
 
@@ -424,6 +436,9 @@ async def evaluate_task(self, task_id: str, task_index: int) -> Dict[str, Any]:
                 user_context = _build_user_context(world)
 
                 def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], intent: str):
+                    agent_steps = result.get("steps")
+                    if agent_steps is None:
+                        agent_steps = len(tracker.steps) or len(result.get("tool_calls") or [])
                     eval_info = result.get("appworld_evaluation") or {}
                     report_md = json.dumps(
                         {
@@ -443,7 +458,7 @@ def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], inte
                             score=0.0,
                             agent_answer="",
                             exception=True,
-                            num_steps=0,
+                            num_steps=agent_steps,
                             total_llm_calls=result.get("total_llm_calls", 0),
                             total_tokens=result.get("total_tokens", 0),
                             total_cost=result.get("total_cost", 0.0),
@@ -461,7 +476,7 @@ def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], inte
                             score=score,
                             agent_answer=result.get("response", ""),
                             exception=False,
-                            num_steps=0,
+                            num_steps=agent_steps,
                             total_llm_calls=result.get("total_llm_calls", 0),
                             total_tokens=result.get("total_tokens", 0),
                             total_cost=result.get("total_cost", 0.0),

diff --git a/benchmarks/helpers/compare_report.py b/benchmarks/helpers/compare_report.py
@@ -48,8 +48,8 @@ def _format_config_label(config_key: str) -> str:
 
 
 def _fmt(val, fmt=","):
-    """Format a numeric value, returning '--' if zero/None."""
-    if val is None or val == 0:
+    """Format a numeric value, returning '--' if None (zero is shown as 0)."""
+    if val is None:
         return "--"
     if fmt == ",":
         # Use 1-decimal precision for floats so we don't surface float-repr

diff --git a/benchmarks/helpers/sdk_eval_helpers.py b/benchmarks/helpers/sdk_eval_helpers.py
@@ -1059,6 +1059,8 @@ async def evaluate_task_with_langfuse(
         react_steps = _react_steps_from_invoke_result(invoke_result)
         if react_steps is not None:
             result["steps"] = react_steps
+        elif result.get("steps") is None and tool_calls:
+            result["steps"] = len(tool_calls)
 
         if predefined_trace_id:
             result["trace_id"] = predefined_trace_id

diff --git a/benchmarks/helpers/tests/test_validate_bundle_report.py b/benchmarks/helpers/tests/test_validate_bundle_report.py
@@ -0,0 +1,48 @@
+"""Sanity checks for bundle report.md validation."""
+
+import pytest
+
+from benchmarks.helpers.validate_bundle_report import validate_report_md
+
+pytestmark = pytest.mark.sanity
+
+
+def test_validate_report_ok(tmp_path):
+    """A report with every summary total and per-task metric filled in yields no errors."""
+    report = tmp_path / "report.md"
+    # The fixture contains non-ASCII glyphs ("✓"), so write it with an explicit encoding.
+    report.write_text(
+        """# Evaluation Report
+
+## Summary
+
+- **Total Tokens**: 1,234
+- **Total LLM Calls**: 5
+- **Total Duration**: 12.5s
+
+## Per-Task Results
+
+| Task | Result | Tokens | Cost | LLM Calls | Cache Tokens | Duration | Steps |
+|------|--------|--------|------|-----------|--------------|----------|-------|
+| t1 | ✓ | 1,234 | -- | 5 | 0 | 12.5s | 3 |
+""",
+        encoding="utf-8",
+    )
+    assert validate_report_md(report) == []
+
+
+def test_validate_report_flags_missing_metrics(tmp_path):
+    """A task row holding only "--" placeholders is reported, including its Tokens column."""
+    report = tmp_path / "report.md"
+    report.write_text(
+        """## Per-Task Results
+
+| Task | Result | Tokens | Cost | LLM Calls | Cache Tokens | Duration | Steps |
+|------|--------|--------|------|-----------|--------------|----------|-------|
+| t1 | ✓ | -- | -- | -- | -- | -- | -- |
+""",
+        encoding="utf-8",
+    )
+    errors = validate_report_md(report)
+    assert errors
+    assert any("Tokens" in e for e in errors)
diff --git a/benchmarks/helpers/validate_bundle_report.py b/benchmarks/helpers/validate_bundle_report.py
@@ -0,0 +1,127 @@
+"""Validate report.md from an eval bundle."""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+# Per-task table columns that must hold a real value in every task row.
+_REQUIRED_COLS = frozenset({"Tokens", "LLM Calls", "Cache Tokens", "Duration", "Steps"})
+# Cell contents treated as "no value".
+_EMPTY_MARKERS = frozenset({"--", "—", "-"})
+
+
+def _parse_table_header(line: str) -> list[str] | None:
+    """Return the cells of a per-task table header row, or None for any other line.
+
+    A header row starts with ``|``, contains no ``---`` run, and has ``Task`` as its
+    first cell.
+    """
+    if not line.startswith("|") or "---" in line:
+        return None
+    cells = [c.strip() for c in line.strip().strip("|").split("|")]
+    return cells if cells and cells[0] == "Task" else None
+
+
+def _is_separator(line: str) -> bool:
+    """Return True if *line* is a table row containing a run of three or more dashes."""
+    return line.startswith("|") and re.search(r"-{3,}", line) is not None
+
+
+def validate_report_md(path: Path) -> list[str]:
+    """Check a bundle ``report.md`` for empty per-task metrics and summary totals.
+
+    Only tables under a ``## Per-Task`` heading are inspected. Summary totals are
+    checked only when their label is present in the report.
+
+    Args:
+        path: Markdown report to validate.
+
+    Returns:
+        One message per problem found; an empty list means nothing was flagged.
+    """
+    # The report holds non-ASCII glyphs (e.g. the "—" marker), so decode explicitly
+    # instead of relying on the locale's default encoding.
+    text = path.read_text(encoding="utf-8")
+    errors: list[str] = []
+
+    in_per_task = False
+    header_cols: list[str] | None = None
+    required_indices: list[int] = []
+
+    for line_no, line in enumerate(text.splitlines(), start=1):
+        if line.startswith("## Per-Task"):
+            # A new per-task section starts: forget any header seen earlier.
+            in_per_task = True
+            header_cols = None
+            required_indices = []
+            continue
+        if in_per_task and line.startswith("## "):
+            in_per_task = False
+            continue
+        if not in_per_task or not line.startswith("|"):
+            continue
+        if _is_separator(line):
+            continue
+
+        cols = _parse_table_header(line)
+        if cols is not None:
+            header_cols = cols
+            required_indices = [i for i, name in enumerate(header_cols) if name in _REQUIRED_COLS]
+            continue
+
+        if not header_cols or not required_indices:
+            continue
+
+        cells = [c.strip() for c in line.strip().strip("|").split("|")]
+        # Rows with fewer cells than the header are skipped rather than reported.
+        if len(cells) < len(header_cols):
+            continue
+        # Skip filler rows: first three cells blank and last cell blank or an em dash.
+        if all(not cells[i] for i in range(min(3, len(cells)))) and cells[-1] in ("", "—"):
+            continue
+
+        for idx in required_indices:
+            col_name = header_cols[idx]
+            val = cells[idx]
+            if not val or val in _EMPTY_MARKERS:
+                task_label = cells[0] or cells[1] or f"line {line_no}"
+                errors.append(f"{path}:{line_no}: {col_name} is empty for task {task_label!r}")
+
+    for label in ("Total Tokens", "Total LLM Calls", "Total Duration"):
+        # Match only the rest of the label's own line: `\s*(.+)` would skip past the
+        # newline of an empty value and capture the following line instead.
+        m = re.search(rf"\*\*{re.escape(label)}\*\*:[ \t]*(.*)", text)
+        if m:
+            val = m.group(1).strip()
+            if not val or val in _EMPTY_MARKERS:
+                errors.append(f"{path}: summary {label} is missing")
+
+    return errors
+
+
+def main() -> int:
+    """Validate the report given on the command line; return a process exit code.
+
+    Returns 0 when the report passes, 1 when it is missing or has problems.
+    """
+    import argparse
+    import sys
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("report", type=Path)
+    args = parser.parse_args()
+    if not args.report.is_file():
+        print(f"report not found: {args.report}", file=sys.stderr)
+        return 1
+    errors = validate_report_md(args.report)
+    if errors:
+        for err in errors:
+            print(err, file=sys.stderr)
+        return 1
+    print(f"OK: {args.report}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())