Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .bob/commands/cuga-create-pr.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ just ci

(`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice.

Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers):

```bash
just test-smoke-e2e
```

#### Run the Command


Expand Down
6 changes: 6 additions & 0 deletions .claude/commands/cuga-create-pr.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ just ci

(`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice.

Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers):

```bash
just test-smoke-e2e
```

#### Run the Command


Expand Down
6 changes: 6 additions & 0 deletions .cursor/commands/cuga-create-pr.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ just ci

(`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice.

Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers):

```bash
just test-smoke-e2e
```

#### Run the Command


Expand Down
3 changes: 2 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,10 @@ or under the root `tests/` directory.
```bash
just lint # ruff check + ruff format --check
just test-sanity # ~5s
just test-smoke-e2e # live: AppWorld + M3 (needs API keys, AppWorld, M3 containers)
just test-regression # ~7s
just security # bandit + pip-audit
just ci # all of the above
just ci # lint + test-regression + security (smoke is optional/manual, not included)
```

CI runs the same `lint`, `test-regression`, and `security` checks on
Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,16 @@ Bundle structure (comparison):

Bundles are stored in `benchmarks/{benchmark}/evaluation_bundles/` and are git-ignored.

#### Resilience: bundles and partial results on interrupt or crash

M3's `eval.sh`/`compare.sh` salvage a best-effort bundle even when a run is interrupted (**Ctrl-C**) or crashes mid-flight, instead of silently losing everything collected so far:

- **Partial result files**: if the evaluator is interrupted or hits an unexpected exception mid-run, it saves whatever task results were already collected to `benchmarks/m3/results/m3_config_partial_*.json` (or `m3_config_no_gt_partial_*.json` with `--no-ground-truth`); the `partial` marker in the file name distinguishes these from complete-run files.
- **Bundle on exit**: `create_bundle`/`create_compare_bundle` are idempotent and run from both the success path and the script's `cleanup` trap (`trap cleanup EXIT INT TERM ERR`), so a bundle is produced exactly once whether the run finishes normally, is interrupted, or crashes — picking up the freshest result file written during that run.
- **Comparisons exclude partials**: `compare.sh` filters `m3_config_partial_*`/`m3_config_no_gt_partial_*` out of its result-file collection so an interrupted run doesn't skew aggregate pass-rate/token totals in a comparison report.

This salvage behavior is best-effort, and any loss is limited to the in-flight task: completed tasks/domains are preserved, but progress within the task that was running at the moment of interruption may still be lost.

---

## 🔧 Configuration
Expand Down
19 changes: 17 additions & 2 deletions benchmarks/appworld/eval_appworld_sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
save_evaluation_results,
setup_agent_with_tools,
)
from benchmarks.helpers.sdk_eval_helpers import _react_steps_from_invoke_result

tracker = ActivityTracker()
var_manager = VariablesManager()
Expand Down Expand Up @@ -137,6 +138,7 @@ async def invoke_and_score_appworld(
eval_dict: Dict[str, Any] = {}
trace_id: Optional[str] = None
_langfuse_metrics = None
invoke_result_holder: List[Any] = []

async def run_invoke(invoke_config: Optional[dict] = None) -> None:
nonlocal response, tool_calls, err, is_error, invoked
Expand All @@ -148,6 +150,8 @@ async def run_invoke(invoke_config: Optional[dict] = None) -> None:
track_tool_calls=track_tool_calls,
config=invoke_config or {},
)
invoke_result_holder.clear()
invoke_result_holder.append(invoke_result)
response = invoke_result.answer
tool_calls = list(invoke_result.tool_calls or []) if track_tool_calls else []
invoked = True
Expand Down Expand Up @@ -310,6 +314,14 @@ def complete_and_eval() -> None:
result["llm_call_details"] = _langfuse_metrics.llm_call_details
result["node_timings"] = _langfuse_metrics.node_timings

agent_steps = None
if invoke_result_holder:
agent_steps = _react_steps_from_invoke_result(invoke_result_holder[0])
if agent_steps is None:
agent_steps = len(tracker.steps) or len(tool_calls)
if agent_steps is not None:
result["steps"] = agent_steps

return result


Expand Down Expand Up @@ -424,6 +436,9 @@ async def evaluate_task(self, task_id: str, task_index: int) -> Dict[str, Any]:
user_context = _build_user_context(world)

def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], intent: str):
agent_steps = result.get("steps")
if agent_steps is None:
agent_steps = len(tracker.steps) or len(result.get("tool_calls") or [])
eval_info = result.get("appworld_evaluation") or {}
report_md = json.dumps(
{
Expand All @@ -443,7 +458,7 @@ def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], inte
score=0.0,
agent_answer="",
exception=True,
num_steps=0,
num_steps=agent_steps,
total_llm_calls=result.get("total_llm_calls", 0),
total_tokens=result.get("total_tokens", 0),
total_cost=result.get("total_cost", 0.0),
Expand All @@ -461,7 +476,7 @@ def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], inte
score=score,
agent_answer=result.get("response", ""),
exception=False,
num_steps=0,
num_steps=agent_steps,
total_llm_calls=result.get("total_llm_calls", 0),
total_tokens=result.get("total_tokens", 0),
total_cost=result.get("total_cost", 0.0),
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/helpers/compare_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ def _format_config_label(config_key: str) -> str:


def _fmt(val, fmt=","):
"""Format a numeric value, returning '--' if zero/None."""
if val is None or val == 0:
"""Format a numeric value, returning '--' if None (zero is shown as 0)."""
if val is None:
return "--"
if fmt == ",":
# Use 1-decimal precision for floats so we don't surface float-repr
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/helpers/sdk_eval_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1059,6 +1059,8 @@ async def evaluate_task_with_langfuse(
react_steps = _react_steps_from_invoke_result(invoke_result)
if react_steps is not None:
result["steps"] = react_steps
elif result.get("steps") is None and tool_calls:
result["steps"] = len(tool_calls)

if predefined_trace_id:
result["trace_id"] = predefined_trace_id
Expand Down
43 changes: 43 additions & 0 deletions benchmarks/helpers/tests/test_validate_bundle_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Sanity checks for bundle report.md validation."""

import pytest

from benchmarks.helpers.validate_bundle_report import validate_report_md

pytestmark = pytest.mark.sanity


def test_validate_report_ok(tmp_path):
    """A report with populated summary and per-task metrics yields no errors."""
    report = tmp_path / "report.md"
    # The fixture contains non-ASCII text, so pin the encoding instead of
    # relying on the platform default.
    report.write_text(
        """# Evaluation Report

## Summary

- **Total Tokens**: 1,234
- **Total LLM Calls**: 5
- **Total Duration**: 12.5s

## Per-Task Results

| Task | Result | Tokens | Cost | LLM Calls | Cache Tokens | Duration | Steps |
|------|--------|--------|------|-----------|--------------|----------|-------|
| t1 | ✓ | 1,234 | -- | 5 | 0 | 12.5s | 3 |
""",
        encoding="utf-8",
    )
    assert validate_report_md(report) == []


def test_validate_report_flags_missing_metrics(tmp_path):
    """Every required per-task column left as ``--`` is reported by name."""
    report = tmp_path / "report.md"
    # The fixture contains non-ASCII text, so pin the encoding instead of
    # relying on the platform default.
    report.write_text(
        """## Per-Task Results

| Task | Result | Tokens | Cost | LLM Calls | Cache Tokens | Duration | Steps |
|------|--------|--------|------|-----------|--------------|----------|-------|
| t1 | ✓ | -- | -- | -- | -- | -- | -- |
""",
        encoding="utf-8",
    )
    errors = validate_report_md(report)
    assert errors
    assert any("Tokens" in e for e in errors)
    # "Tokens" is a substring of "Cache Tokens", so match the full
    # ": <column> is empty" message to prove each column was flagged itself.
    for column in ("Tokens", "LLM Calls", "Cache Tokens", "Duration", "Steps"):
        assert any(f": {column} is empty for task" in e for e in errors), column
97 changes: 97 additions & 0 deletions benchmarks/helpers/validate_bundle_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""Validate report.md from an eval bundle."""

from __future__ import annotations

import re
from pathlib import Path

# Per-task table columns that must carry a real value in every data row.
_REQUIRED_COLS = frozenset({"Tokens", "LLM Calls", "Cache Tokens", "Duration", "Steps"})
# Cell contents that mean "no value was recorded".
_EMPTY_MARKERS = frozenset({"--", "—", "-"})
# Summary bullets that, when present, must carry a value.
_SUMMARY_LABELS = ("Total Tokens", "Total LLM Calls", "Total Duration")


def _split_row(line: str) -> list[str]:
    """Split a pipe-delimited table row into stripped cell strings."""
    return [cell.strip() for cell in line.strip().strip("|").split("|")]


def _parse_table_header(line: str) -> list[str] | None:
    """Return the column names if *line* is a per-task table header, else None.

    A header is a pipe-delimited row whose first cell is ``Task``. Any row
    containing ``---`` is rejected.
    """
    if not line.startswith("|") or "---" in line:
        return None
    cells = _split_row(line)
    return cells if cells and cells[0] == "Task" else None


def _is_separator(line: str) -> bool:
    """Return True if *line* is a table delimiter row such as ``|---|:---:|``.

    The row must contain a run of at least three dashes, and every cell must
    consist solely of dashes and colons (or be empty). Checking the cells
    keeps a data row that merely contains ``---`` (for example in a task id)
    from being mistaken for a separator and skipped.
    """
    if not line.startswith("|") or re.search(r"-{3,}", line) is None:
        return False
    return all(not cell.strip("-:") for cell in _split_row(line))


def validate_report_md(path: Path) -> list[str]:
    """Check a bundle ``report.md`` for missing metrics.

    Two things are validated:

    * In every ``## Per-Task`` section, each data row of a table whose header
      starts with ``Task`` must have a non-empty, non-placeholder value in
      every column listed in ``_REQUIRED_COLS`` that the header declares.
    * Each of the summary bullets ``**Total Tokens**``, ``**Total LLM
      Calls**`` and ``**Total Duration**``, if present, must carry a value on
      the same line.

    Args:
        path: Location of the report file; read as UTF-8.

    Returns:
        A list of human-readable error strings, empty when the report is valid.
    """
    # Reports contain non-ASCII glyphs, so do not depend on the locale encoding.
    text = path.read_text(encoding="utf-8")
    errors: list[str] = []

    in_per_task = False
    header_cols: list[str] | None = None
    required_indices: list[int] = []

    for line_no, line in enumerate(text.splitlines(), start=1):
        if line.startswith("## Per-Task"):
            # A new per-task section starts: forget any previous table header.
            in_per_task = True
            header_cols = None
            required_indices = []
            continue
        if in_per_task and line.startswith("## "):
            in_per_task = False
            continue
        if not in_per_task or not line.startswith("|"):
            continue
        if _is_separator(line):
            continue

        cols = _parse_table_header(line)
        if cols:
            header_cols = cols
            required_indices = [i for i, name in enumerate(header_cols) if name in _REQUIRED_COLS]
            continue

        if not header_cols or not required_indices:
            continue

        cells = _split_row(line)
        # Rows with fewer cells than the header are skipped, not reported.
        if len(cells) < len(header_cols):
            continue
        # Skip spacer rows: leading cells blank and last cell blank or a dash.
        if all(not cells[i] for i in range(min(3, len(cells)))) and cells[-1] in ("", "—"):
            continue

        for idx in required_indices:
            val = cells[idx]
            if not val or val in _EMPTY_MARKERS:
                task_label = cells[0] or cells[1] or f"line {line_no}"
                errors.append(
                    f"{path}:{line_no}: {header_cols[idx]} is empty for task {task_label!r}"
                )

    for label in _SUMMARY_LABELS:
        # Only horizontal whitespace may follow the colon. With ``\s*`` the
        # match would cross the newline and capture the next line as the
        # value, hiding a blank summary entry.
        m = re.search(rf"\*\*{re.escape(label)}\*\*:[^\S\n]*(.*)", text)
        if m is None:
            continue
        val = m.group(1).strip()
        if not val or val in _EMPTY_MARKERS:
            errors.append(f"{path}: summary {label} is missing")

    return errors


def main() -> int:
    """Command-line entry point: validate one report file.

    Takes a single positional ``report`` path. Returns 0 and prints an ``OK``
    line when the report validates. Returns 1 when the file does not exist or
    validation produced errors, printing the reason(s) to stderr.
    """
    import argparse
    import sys

    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("report", type=Path)
    report_path = cli.parse_args().report

    if not report_path.is_file():
        print(f"report not found: {report_path}", file=sys.stderr)
        return 1

    problems = validate_report_md(report_path)
    if not problems:
        print(f"OK: {report_path}")
        return 0

    for problem in problems:
        print(problem, file=sys.stderr)
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
Loading
Loading