diff --git a/.bob/commands/cuga-create-pr.md b/.bob/commands/cuga-create-pr.md index e4a809e..7697637 100644 --- a/.bob/commands/cuga-create-pr.md +++ b/.bob/commands/cuga-create-pr.md @@ -81,6 +81,12 @@ just ci (`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice. +Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers): + +```bash +just test-smoke-e2e +``` + #### Run the Command diff --git a/.claude/commands/cuga-create-pr.md b/.claude/commands/cuga-create-pr.md index e4a809e..7697637 100644 --- a/.claude/commands/cuga-create-pr.md +++ b/.claude/commands/cuga-create-pr.md @@ -81,6 +81,12 @@ just ci (`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice. +Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers): + +```bash +just test-smoke-e2e +``` + #### Run the Command diff --git a/.cursor/commands/cuga-create-pr.md b/.cursor/commands/cuga-create-pr.md index e4a809e..7697637 100644 --- a/.cursor/commands/cuga-create-pr.md +++ b/.cursor/commands/cuga-create-pr.md @@ -81,6 +81,12 @@ just ci (`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice. +Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers): + +```bash +just test-smoke-e2e +``` + #### Run the Command diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2b067b5..1b693ff 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -45,9 +45,10 @@ or under the root `tests/` directory. 
```bash just lint # ruff check + ruff format --check just test-sanity # ~5s +just test-smoke-e2e # live: AppWorld + M3 (needs API keys, AppWorld, M3 containers) just test-regression # ~7s just security # bandit + pip-audit -just ci # all of the above +just ci # lint + test-regression + security (smoke is optional/manual, not included) ``` CI runs the same `lint`, `test-regression`, and `security` checks on diff --git a/README.md b/README.md index 18d3a27..e980d72 100644 --- a/README.md +++ b/README.md @@ -417,6 +417,16 @@ Bundle structure (comparison): Bundles are stored in `benchmarks/{benchmark}/evaluation_bundles/` and are git-ignored. +#### Resilience: bundles and partial results on interrupt or crash + +M3's `eval.sh`/`compare.sh` salvage a best-effort bundle even when a run is interrupted (**Ctrl-C**) or crashes mid-flight, instead of silently losing everything collected so far: + +- **Partial result files**: if the evaluator is interrupted or hits an unexpected exception mid-run, it saves whatever task results were already collected to `benchmarks/m3/results/m3_config_partial_*.json` (or `m3_config_no_gt_partial_*.json` with `--no-ground-truth`), distinguishable from complete-run files by the `partial` prefix. +- **Bundle on exit**: `create_bundle`/`create_compare_bundle` are idempotent and run from both the success path and the script's `cleanup` trap (`trap cleanup EXIT INT TERM ERR`), so a bundle is produced exactly once whether the run finishes normally, is interrupted, or crashes — picking up the freshest result file written during that run. +- **Comparisons exclude partials**: `compare.sh` filters `m3_config_partial_*`/`m3_config_no_gt_partial_*` out of its result-file collection so an interrupted run doesn't skew aggregate pass-rate/token totals in a comparison report. 
+ +This salvage behavior is best-effort and bounded to the currently in-flight task — completed tasks/domains are preserved, but progress within the task that was running at the moment of interruption may still be lost. + --- ## 🔧 Configuration diff --git a/benchmarks/appworld/eval_appworld_sdk.py b/benchmarks/appworld/eval_appworld_sdk.py index 125b932..67c9f67 100644 --- a/benchmarks/appworld/eval_appworld_sdk.py +++ b/benchmarks/appworld/eval_appworld_sdk.py @@ -59,6 +59,7 @@ save_evaluation_results, setup_agent_with_tools, ) +from benchmarks.helpers.sdk_eval_helpers import _react_steps_from_invoke_result tracker = ActivityTracker() var_manager = VariablesManager() @@ -137,6 +138,7 @@ async def invoke_and_score_appworld( eval_dict: Dict[str, Any] = {} trace_id: Optional[str] = None _langfuse_metrics = None + invoke_result_holder: List[Any] = [] async def run_invoke(invoke_config: Optional[dict] = None) -> None: nonlocal response, tool_calls, err, is_error, invoked @@ -148,6 +150,8 @@ async def run_invoke(invoke_config: Optional[dict] = None) -> None: track_tool_calls=track_tool_calls, config=invoke_config or {}, ) + invoke_result_holder.clear() + invoke_result_holder.append(invoke_result) response = invoke_result.answer tool_calls = list(invoke_result.tool_calls or []) if track_tool_calls else [] invoked = True @@ -310,6 +314,14 @@ def complete_and_eval() -> None: result["llm_call_details"] = _langfuse_metrics.llm_call_details result["node_timings"] = _langfuse_metrics.node_timings + agent_steps = None + if invoke_result_holder: + agent_steps = _react_steps_from_invoke_result(invoke_result_holder[0]) + if agent_steps is None: + agent_steps = len(tracker.steps) or len(tool_calls) + if agent_steps is not None: + result["steps"] = agent_steps + return result @@ -424,6 +436,9 @@ async def evaluate_task(self, task_id: str, task_index: int) -> Dict[str, Any]: user_context = _build_user_context(world) def tracker_callback(result: Dict[str, Any], keyword_check: 
Dict[str, Any], intent: str): + agent_steps = result.get("steps") + if agent_steps is None: + agent_steps = len(tracker.steps) or len(result.get("tool_calls") or []) eval_info = result.get("appworld_evaluation") or {} report_md = json.dumps( { @@ -443,7 +458,7 @@ def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], inte score=0.0, agent_answer="", exception=True, - num_steps=0, + num_steps=agent_steps, total_llm_calls=result.get("total_llm_calls", 0), total_tokens=result.get("total_tokens", 0), total_cost=result.get("total_cost", 0.0), @@ -461,7 +476,7 @@ def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], inte score=score, agent_answer=result.get("response", ""), exception=False, - num_steps=0, + num_steps=agent_steps, total_llm_calls=result.get("total_llm_calls", 0), total_tokens=result.get("total_tokens", 0), total_cost=result.get("total_cost", 0.0), diff --git a/benchmarks/helpers/compare_report.py b/benchmarks/helpers/compare_report.py index 81bfe7f..f2d4d89 100644 --- a/benchmarks/helpers/compare_report.py +++ b/benchmarks/helpers/compare_report.py @@ -48,8 +48,8 @@ def _format_config_label(config_key: str) -> str: def _fmt(val, fmt=","): - """Format a numeric value, returning '--' if zero/None.""" - if val is None or val == 0: + """Format a numeric value, returning '--' if None (zero is shown as 0).""" + if val is None: return "--" if fmt == ",": # Use 1-decimal precision for floats so we don't surface float-repr diff --git a/benchmarks/helpers/sdk_eval_helpers.py b/benchmarks/helpers/sdk_eval_helpers.py index dd6e605..bc56338 100644 --- a/benchmarks/helpers/sdk_eval_helpers.py +++ b/benchmarks/helpers/sdk_eval_helpers.py @@ -1059,6 +1059,8 @@ async def evaluate_task_with_langfuse( react_steps = _react_steps_from_invoke_result(invoke_result) if react_steps is not None: result["steps"] = react_steps + elif result.get("steps") is None and tool_calls: + result["steps"] = len(tool_calls) if predefined_trace_id: 
result["trace_id"] = predefined_trace_id diff --git a/benchmarks/helpers/tests/test_validate_bundle_report.py b/benchmarks/helpers/tests/test_validate_bundle_report.py new file mode 100644 index 0000000..e00e998 --- /dev/null +++ b/benchmarks/helpers/tests/test_validate_bundle_report.py @@ -0,0 +1,43 @@ +"""Sanity checks for bundle report.md validation.""" + +import pytest + +from benchmarks.helpers.validate_bundle_report import validate_report_md + +pytestmark = pytest.mark.sanity + + +def test_validate_report_ok(tmp_path): + report = tmp_path / "report.md" + report.write_text( + """# Evaluation Report + +## Summary + +- **Total Tokens**: 1,234 +- **Total LLM Calls**: 5 +- **Total Duration**: 12.5s + +## Per-Task Results + +| Task | Result | Tokens | Cost | LLM Calls | Cache Tokens | Duration | Steps | +|------|--------|--------|------|-----------|--------------|----------|-------| +| t1 | ✓ | 1,234 | -- | 5 | 0 | 12.5s | 3 | +""" + ) + assert validate_report_md(report) == [] + + +def test_validate_report_flags_missing_metrics(tmp_path): + report = tmp_path / "report.md" + report.write_text( + """## Per-Task Results + +| Task | Result | Tokens | Cost | LLM Calls | Cache Tokens | Duration | Steps | +|------|--------|--------|------|-----------|--------------|----------|-------| +| t1 | ✓ | -- | -- | -- | -- | -- | -- | +""" + ) + errors = validate_report_md(report) + assert errors + assert any("Tokens" in e for e in errors) diff --git a/benchmarks/helpers/validate_bundle_report.py b/benchmarks/helpers/validate_bundle_report.py new file mode 100644 index 0000000..6020646 --- /dev/null +++ b/benchmarks/helpers/validate_bundle_report.py @@ -0,0 +1,97 @@ +"""Validate report.md from an eval bundle.""" + +from __future__ import annotations + +import re +from pathlib import Path + +_REQUIRED_COLS = frozenset({"Tokens", "LLM Calls", "Cache Tokens", "Duration", "Steps"}) +_EMPTY_MARKERS = frozenset({"--", "—", "-"}) + + +def _parse_table_header(line: str) -> list[str] | 
None: + if not line.startswith("|") or "---" in line: + return None + cells = [c.strip() for c in line.strip().strip("|").split("|")] + return cells if cells and cells[0] == "Task" else None + + +def _is_separator(line: str) -> bool: + return line.startswith("|") and re.search(r"-{3,}", line) is not None + + +def validate_report_md(path: Path) -> list[str]: + text = path.read_text() + errors: list[str] = [] + + in_per_task = False + header_cols: list[str] | None = None + required_indices: list[int] = [] + + for line_no, line in enumerate(text.splitlines(), start=1): + if line.startswith("## Per-Task"): + in_per_task = True + header_cols = None + required_indices = [] + continue + if in_per_task and line.startswith("## "): + in_per_task = False + continue + if not in_per_task or not line.startswith("|"): + continue + if _is_separator(line): + continue + + cols = _parse_table_header(line) + if cols and cols[0] == "Task": + header_cols = cols + required_indices = [i for i, name in enumerate(header_cols) if name in _REQUIRED_COLS] + continue + + if not header_cols or not required_indices: + continue + + cells = [c.strip() for c in line.strip().strip("|").split("|")] + if len(cells) < len(header_cols): + continue + if all(not cells[i] for i in range(min(3, len(cells)))) and cells[-1] in ("", "—"): + continue + + for idx in required_indices: + col_name = header_cols[idx] + val = cells[idx] if idx < len(cells) else "" + if not val or val in _EMPTY_MARKERS: + task_label = cells[0] or cells[1] or f"line {line_no}" + errors.append(f"{path}:{line_no}: {col_name} is empty for task {task_label!r}") + + for label in ("Total Tokens", "Total LLM Calls", "Total Duration"): + m = re.search(rf"\*\*{re.escape(label)}\*\*:\s*(.+)", text) + if m: + val = m.group(1).strip() + if not val or val in _EMPTY_MARKERS: + errors.append(f"{path}: summary {label} is missing") + + return errors + + +def main() -> int: + import argparse + import sys + + parser = 
argparse.ArgumentParser(description=__doc__) + parser.add_argument("report", type=Path) + args = parser.parse_args() + if not args.report.is_file(): + print(f"report not found: {args.report}", file=sys.stderr) + return 1 + errors = validate_report_md(args.report) + if errors: + for err in errors: + print(err, file=sys.stderr) + return 1 + print(f"OK: {args.report}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/m3/compare.sh b/benchmarks/m3/compare.sh index 4757077..8141108 100755 --- a/benchmarks/m3/compare.sh +++ b/benchmarks/m3/compare.sh @@ -181,7 +181,210 @@ runs_done=0 runs_elapsed_total=0 compare_t0=$(date +%s) +BUNDLE_DONE=false + +# Best-effort comparison bundle. Defined as a function so it can be called +# from both the success path at the bottom of the script AND from the +# compare_cleanup trap on interrupt/crash (issues #91, #92). Idempotent via +# BUNDLE_DONE. Reads CONFIG_RESULT_KEYS/VALS and CONFIG_TRAJ_KEYS/VALS which +# are populated by the per-config loop below — on early interrupt those +# arrays may be empty, in which case the function exits cleanly. 
+create_compare_bundle() { + [ "$BUNDLE_DONE" = "true" ] && return 0 + [[ "${NO_BUNDLE:-false}" == "true" ]] && return 0 + BUNDLE_DONE=true + + echo "" + echo -e "${YELLOW:-}Creating comparison bundle...${NC:-}" + + # Build JSON input: {"model:agent": ["file1.json", ...]} + local JSON_PARTS=() + local ci config files file_list pfirst f + for ci in "${!CONFIG_RESULT_KEYS[@]}"; do + config="${CONFIG_RESULT_KEYS[$ci]}" + files="${CONFIG_RESULT_VALS[$ci]}" + if [[ -z "$files" ]]; then + continue + fi + file_list="" + pfirst=true + for f in $files; do + if [[ "$pfirst" != "true" ]]; then + file_list+="," + fi + pfirst=false + file_list+="\"${f}\"" + done + JSON_PARTS+=("\"${config}\":[${file_list}]") + done + + local JSON_INPUT="{" + local jfirst=true part + for part in "${JSON_PARTS[@]}"; do + if [[ "$jfirst" != "true" ]]; then + JSON_INPUT+="," + fi + jfirst=false + JSON_INPUT+="$part" + done + JSON_INPUT+="}" + + if [[ "$JSON_INPUT" == "{}" ]]; then + echo -e "${YELLOW:-}No completed runs to bundle — skipping.${NC:-}" + return 0 + fi + + # Generate comparison report (best-effort) + echo -e "${YELLOW:-}Generating comparison report...${NC:-}" + local REPORT_TMP + REPORT_TMP=$(mktemp /tmp/m3_report_XXXXXX) + echo "$JSON_INPUT" | (cd "$PROJECT_ROOT" && uv run --no-sync python -m benchmarks.helpers.compare_report --output "$REPORT_TMP") \ + || echo -e "${YELLOW:-}Report generation failed — bundling without comparison report.${NC:-}" + echo "" + + # Build per-model env snapshot for bundle + local MODEL_ENVS_JSON="" + if type build_model_envs_json &>/dev/null; then + MODEL_ENVS_JSON=$(build_model_envs_json "${MODEL_LIST[@]}") + fi + + # Build per-config trajectory dirs JSON grouped by run: + # {"model:agent:policy": [["/run1/domA", ...], ["/run2/domA", ...]]} + # CONFIG_TRAJ_VALS holds sentinel-delimited groups (one per eval run). 
+ local TRAJ_JSON_PARTS=() + local tconfig tgroups groups_json cur_group in_group line + for ci in "${!CONFIG_TRAJ_KEYS[@]}"; do + tconfig="${CONFIG_TRAJ_KEYS[$ci]}" + tgroups="${CONFIG_TRAJ_VALS[$ci]}" + if [[ -z "$tgroups" ]]; then + continue + fi + groups_json="" + cur_group="" + in_group=false + while IFS= read -r line; do + if [[ "$line" == "$TRAJ_GROUP_SEP" ]]; then + if [[ "$in_group" == "true" ]]; then + if [[ -n "$groups_json" ]]; then groups_json+=","; fi + groups_json+="[${cur_group}]" + fi + cur_group="" + in_group=true + continue + fi + [[ -z "$line" ]] && continue + if [[ -n "$cur_group" ]]; then cur_group+=","; fi + cur_group+="\"${line}\"" + done <<< "$tgroups" + if [[ "$in_group" == "true" ]]; then + if [[ -n "$groups_json" ]]; then groups_json+=","; fi + groups_json+="[${cur_group}]" + fi + if [[ -z "$groups_json" ]]; then + continue + fi + TRAJ_JSON_PARTS+=("\"${tconfig}\":[${groups_json}]") + done + + local TRAJ_JSON_INPUT="{" + local tjfirst=true + for part in "${TRAJ_JSON_PARTS[@]}"; do + if [[ "$tjfirst" != "true" ]]; then + TRAJ_JSON_INPUT+="," + fi + tjfirst=false + TRAJ_JSON_INPUT+="$part" + done + TRAJ_JSON_INPUT+="}" + + # Determine task file + local TASK_FILE="$SCRIPT_DIR/data/hockey.json" + local arg + for arg in "${FORWARDED_ARGS[@]}"; do + if [[ "$arg" == "--multiturn" ]]; then + TASK_FILE="$SCRIPT_DIR/data/olympics_multiturn.json" + break + fi + done + + local BUNDLE_CMD=(uv run --no-sync python -m benchmarks.helpers.bundle assemble-compare + --benchmark m3 + --config-results "$JSON_INPUT" + --report "$REPORT_TMP" + --task-files "$TASK_FILE") + + if [[ -n "$MODEL_ENVS_JSON" ]]; then + BUNDLE_CMD+=(--model-envs "$MODEL_ENVS_JSON") + fi + if [[ "$TRAJ_JSON_INPUT" != "{}" ]]; then + BUNDLE_CMD+=(--trajectory-dirs "$TRAJ_JSON_INPUT") + fi + # Build per-config log JSON grouped by run (one console+registry log set + # per eval run) so each run folder gets its OWN logs: + # {"model:agent:policy": [["/run1/console.log", ...], ["/run2/...", 
...]]} + local LOG_JSON_PARTS=() + local lconfig lgroups lgroups_json lcur_group lin_group + for ci in "${!CONFIG_LOG_KEYS[@]}"; do + lconfig="${CONFIG_LOG_KEYS[$ci]}" + lgroups="${CONFIG_LOG_VALS[$ci]}" + if [[ -z "$lgroups" ]]; then + continue + fi + lgroups_json="" + lcur_group="" + lin_group=false + while IFS= read -r line; do + if [[ "$line" == "$LOG_GROUP_SEP" ]]; then + if [[ "$lin_group" == "true" ]]; then + if [[ -n "$lgroups_json" ]]; then lgroups_json+=","; fi + lgroups_json+="[${lcur_group}]" + fi + lcur_group="" + lin_group=true + continue + fi + [[ -z "$line" ]] && continue + if [[ -n "$lcur_group" ]]; then lcur_group+=","; fi + lcur_group+="\"${line}\"" + done <<< "$lgroups" + if [[ "$lin_group" == "true" ]]; then + if [[ -n "$lgroups_json" ]]; then lgroups_json+=","; fi + lgroups_json+="[${lcur_group}]" + fi + if [[ -z "$lgroups_json" ]]; then + continue + fi + LOG_JSON_PARTS+=("\"${lconfig}\":[${lgroups_json}]") + done + local LOG_JSON="{" + local ljfirst=true + for part in "${LOG_JSON_PARTS[@]}"; do + if [[ "$ljfirst" != "true" ]]; then LOG_JSON+=","; fi + ljfirst=false + LOG_JSON+="$part" + done + LOG_JSON+="}" + if [[ "$LOG_JSON" != "{}" ]]; then + BUNDLE_CMD+=(--log-files "$LOG_JSON") + fi + # Download Langfuse traces if available + BUNDLE_CMD+=(--fetch-langfuse) + if [[ "${BUNDLE_ZIP:-false}" == "true" ]]; then + BUNDLE_CMD+=(--zip) + fi + + # Bundle CLI needs project root on PYTHONPATH + (cd "$PROJECT_ROOT" && "${BUNDLE_CMD[@]}") \ + || echo -e "${YELLOW:-}Bundle creation reported errors (best-effort).${NC:-}" + rm -f "$REPORT_TMP" +} + compare_cleanup() { + # Best-effort comparison bundle on interrupt/crash (issues #91, #92). + # If we made it past the per-config loop the success path below will have + # already created the bundle; BUNDLE_DONE makes this idempotent. 
+ create_compare_bundle || true + echo -e "${YELLOW:-}Stopping servers...${NC:-}" kill_port_processes "${REGISTRY_PORT:-8001}" # Staged per-run logs were already copied into the bundle by now. @@ -214,8 +417,14 @@ LOG_GROUP_SEP="@@RUN@@" # the right glob for each agent. _list_results_for_agent() { local agent="$1" + # Exclude interrupt/crash partial saves (m3_config_partial_*, + # m3_config_no_gt_partial_*) — they're incomplete runs and would skew + # compare_report's totals/pass-rate aggregates if folded in alongside + # complete runs. if [[ "$agent" == "cuga" ]]; then - ls -1 "$RESULTS_DIR"/m3_config_*.json 2>/dev/null | sort + ls -1 "$RESULTS_DIR"/m3_config_*.json 2>/dev/null \ + | grep -vE '/m3_config_(no_gt_)?partial_' \ + | sort else # react: m3_*.json but NOT m3_config_*.json (and not multiturn either, # which is a separate flow). @@ -335,179 +544,7 @@ if [ -n "$OUTPUT_FILE" ]; then echo -e "${GREEN:-}✓${NC:-} Results in: $RESULTS_DIR" fi -# Create reproducibility bundle unless skipped -if [[ "${NO_BUNDLE:-false}" != "true" ]]; then - echo "" - echo -e "${YELLOW:-}Creating comparison bundle...${NC:-}" - - # Build JSON input: {"model:agent": ["file1.json", ...]} - JSON_PARTS=() - for ci in "${!CONFIG_RESULT_KEYS[@]}"; do - config="${CONFIG_RESULT_KEYS[$ci]}" - files="${CONFIG_RESULT_VALS[$ci]}" - if [[ -z "$files" ]]; then - continue - fi - file_list="" - pfirst=true - for f in $files; do - if [[ "$pfirst" != "true" ]]; then - file_list+="," - fi - pfirst=false - file_list+="\"${f}\"" - done - JSON_PARTS+=("\"${config}\":[${file_list}]") - done - - JSON_INPUT="{" - jfirst=true - for part in "${JSON_PARTS[@]}"; do - if [[ "$jfirst" != "true" ]]; then - JSON_INPUT+="," - fi - jfirst=false - JSON_INPUT+="$part" - done - JSON_INPUT+="}" - - if [[ "$JSON_INPUT" != "{}" ]]; then - # Generate comparison report - echo -e "${YELLOW:-}Generating comparison report...${NC:-}" - REPORT_TMP=$(mktemp /tmp/m3_report_XXXXXX) - echo "$JSON_INPUT" | (cd "$PROJECT_ROOT" && uv 
run --no-sync python -m benchmarks.helpers.compare_report --output "$REPORT_TMP") - echo "" - - # Build per-model env snapshot for bundle - MODEL_ENVS_JSON="" - if type build_model_envs_json &>/dev/null; then - MODEL_ENVS_JSON=$(build_model_envs_json "${MODEL_LIST[@]}") - fi - - # Build per-config trajectory JSON grouped by run: - # {"model:agent:policy": [["/run1/domA", ...], ["/run2/domA", ...]]} - # CONFIG_TRAJ_VALS holds sentinel-delimited groups (one per eval run). - TRAJ_JSON_PARTS=() - for ci in "${!CONFIG_TRAJ_KEYS[@]}"; do - tconfig="${CONFIG_TRAJ_KEYS[$ci]}" - tgroups="${CONFIG_TRAJ_VALS[$ci]}" - if [[ -z "$tgroups" ]]; then - continue - fi - groups_json="" - cur_group="" - in_group=false - while IFS= read -r line; do - if [[ "$line" == "$TRAJ_GROUP_SEP" ]]; then - if [[ "$in_group" == "true" ]]; then - if [[ -n "$groups_json" ]]; then groups_json+=","; fi - groups_json+="[${cur_group}]" - fi - cur_group="" - in_group=true - continue - fi - [[ -z "$line" ]] && continue - if [[ -n "$cur_group" ]]; then cur_group+=","; fi - cur_group+="\"${line}\"" - done <<< "$tgroups" - if [[ "$in_group" == "true" ]]; then - if [[ -n "$groups_json" ]]; then groups_json+=","; fi - groups_json+="[${cur_group}]" - fi - if [[ -z "$groups_json" ]]; then - continue - fi - TRAJ_JSON_PARTS+=("\"${tconfig}\":[${groups_json}]") - done - - TRAJ_JSON_INPUT="{" - tjfirst=true - for part in "${TRAJ_JSON_PARTS[@]}"; do - if [[ "$tjfirst" != "true" ]]; then - TRAJ_JSON_INPUT+="," - fi - tjfirst=false - TRAJ_JSON_INPUT+="$part" - done - TRAJ_JSON_INPUT+="}" - - # Determine task file - TASK_FILE="$SCRIPT_DIR/data/hockey.json" - for arg in "${FORWARDED_ARGS[@]}"; do - if [[ "$arg" == "--multiturn" ]]; then - TASK_FILE="$SCRIPT_DIR/data/olympics_mutliturn.json" - break - fi - done - - BUNDLE_CMD=(uv run --no-sync python -m benchmarks.helpers.bundle assemble-compare - --benchmark m3 - --config-results "$JSON_INPUT" - --report "$REPORT_TMP" - --task-files "$TASK_FILE") - - if [[ -n 
"$MODEL_ENVS_JSON" ]]; then - BUNDLE_CMD+=(--model-envs "$MODEL_ENVS_JSON") - fi - if [[ "$TRAJ_JSON_INPUT" != "{}" ]]; then - BUNDLE_CMD+=(--trajectory-dirs "$TRAJ_JSON_INPUT") - fi - # Build per-config log JSON grouped by run (one console+registry log set - # per eval run) so each run folder gets its OWN logs: - # {"model:agent:policy": [["/run1/console.log", ...], ["/run2/...", ...]]} - LOG_JSON_PARTS=() - for ci in "${!CONFIG_LOG_KEYS[@]}"; do - lconfig="${CONFIG_LOG_KEYS[$ci]}" - lgroups="${CONFIG_LOG_VALS[$ci]}" - if [[ -z "$lgroups" ]]; then - continue - fi - lgroups_json="" - lcur_group="" - lin_group=false - while IFS= read -r line; do - if [[ "$line" == "$LOG_GROUP_SEP" ]]; then - if [[ "$lin_group" == "true" ]]; then - if [[ -n "$lgroups_json" ]]; then lgroups_json+=","; fi - lgroups_json+="[${lcur_group}]" - fi - lcur_group="" - lin_group=true - continue - fi - [[ -z "$line" ]] && continue - if [[ -n "$lcur_group" ]]; then lcur_group+=","; fi - lcur_group+="\"${line}\"" - done <<< "$lgroups" - if [[ "$lin_group" == "true" ]]; then - if [[ -n "$lgroups_json" ]]; then lgroups_json+=","; fi - lgroups_json+="[${lcur_group}]" - fi - if [[ -z "$lgroups_json" ]]; then - continue - fi - LOG_JSON_PARTS+=("\"${lconfig}\":[${lgroups_json}]") - done - LOG_JSON="{" - ljfirst=true - for part in "${LOG_JSON_PARTS[@]}"; do - if [[ "$ljfirst" != "true" ]]; then LOG_JSON+=","; fi - ljfirst=false - LOG_JSON+="$part" - done - LOG_JSON+="}" - if [[ "$LOG_JSON" != "{}" ]]; then - BUNDLE_CMD+=(--log-files "$LOG_JSON") - fi - # Download Langfuse traces if available - BUNDLE_CMD+=(--fetch-langfuse) - if [[ "${BUNDLE_ZIP:-false}" == "true" ]]; then - BUNDLE_CMD+=(--zip) - fi - - # Bundle CLI needs project root on PYTHONPATH - (cd "$PROJECT_ROOT" && "${BUNDLE_CMD[@]}") - rm -f "$REPORT_TMP" - fi -fi +# Create the comparison bundle (success path). Idempotent — if the cleanup +# trap already created it on interrupt, this is a no-op. 
See create_compare_bundle +# definition near the top of this script and #91, #92. +create_compare_bundle diff --git a/benchmarks/m3/data/olympics_mutliturn.json b/benchmarks/m3/data/olympics_multiturn.json similarity index 100% rename from benchmarks/m3/data/olympics_mutliturn.json rename to benchmarks/m3/data/olympics_multiturn.json diff --git a/benchmarks/m3/eval.sh b/benchmarks/m3/eval.sh index f9cafde..250b8f9 100755 --- a/benchmarks/m3/eval.sh +++ b/benchmarks/m3/eval.sh @@ -114,10 +114,106 @@ done REGISTRY_PID="" +# Timestamp captured before the eval starts. Used by create_bundle to pick +# only the result file(s) produced by *this* run, not a leftover from earlier. +RUN_START_TS=$(date +%s) +BUNDLE_DONE=false + +# Best-effort bundle creation. Called from the success path AND from the +# cleanup trap on Ctrl-C / crash / non-zero exit (issues #91, #92), so a +# long run that is interrupted still leaves logs + trajectories + any +# results that were already written. Skips silently if --no-bundle was +# passed, or if nothing from this run was produced yet. +create_bundle() { + [ "$BUNDLE_DONE" = "true" ] && return 0 + [ "${NO_BUNDLE:-false}" = "true" ] && return 0 + BUNDLE_DONE=true + + echo "" + echo -e "${YELLOW:-}Creating reproducibility bundle...${NC:-}" + + # Find the most recent result file produced by *this* run (mtime newer + # than RUN_START_TS). If the run was killed before any save, there'll be + # nothing here and we skip the bundle — there's nothing meaningful to + # bundle without at least one results JSON. 
+ local latest_result="" + local f + for f in $(ls -t "$SCRIPT_DIR/results"/m3_*.json "$SCRIPT_DIR/results"/multiturn_*.json 2>/dev/null); do + local f_mtime + f_mtime=$(stat -c %Y "$f" 2>/dev/null || stat -f %m "$f" 2>/dev/null) + if [ -n "$f_mtime" ] && [ "$f_mtime" -ge "$RUN_START_TS" ]; then + latest_result="$f" + break + fi + done + + if [ -z "$latest_result" ]; then + echo -e "${YELLOW:-}No result file from this run was found — skipping bundle.${NC:-}" + echo -e "${YELLOW:-}(Console log is still at $CONSOLE_LOG.)${NC:-}" + return 0 + fi + + # Determine task file used + local task_file + if [ "$MULTITURN" = "true" ]; then + task_file="$SCRIPT_DIR/data/olympics_multiturn.json" + else + task_file="$SCRIPT_DIR/data/hockey.json" + fi + + # Generate eval report (best effort — if report generation fails we still + # want the bundle, so don't let `set -e` abort here). + local report_tmp + report_tmp=$(mktemp /tmp/m3_eval_report_XXXXXX) + uv run --no-sync python -m benchmarks.helpers.compare_report eval \ + --result-file "$latest_result" --output "$report_tmp" || \ + echo -e "${YELLOW:-}Report generation failed — bundling without report.${NC:-}" + + local bundle_args=(assemble --benchmark m3 + --result-files "$latest_result" + --task-files "$task_file" + --report "$report_tmp") + if [ -n "$MODEL_PROFILE" ]; then + bundle_args+=(--model-profile "$MODEL_PROFILE") + fi + if [ "$NO_POLICIES" = "true" ]; then + bundle_args+=(--no-policies) + fi + if [ "${BUNDLE_ZIP:-false}" = "true" ]; then + bundle_args+=(--zip) + fi + # Include cuga trajectories + local traj_dir + traj_dir=$(find_latest_trajectory "$SCRIPT_DIR/logging/trajectory_data") + if [ -n "$traj_dir" ]; then + bundle_args+=(--trajectory-dir "$traj_dir") + fi + # Include server and console logs (whichever exists) + local registry_log="$SCRIPT_DIR/registry_server.log" + if [ -f "$registry_log" ]; then + bundle_args+=(--log-files "$registry_log" "$CONSOLE_LOG") + else + bundle_args+=(--log-files /tmp/m3_registry.log 
"$CONSOLE_LOG") + fi + # Download Langfuse traces if available + bundle_args+=(--fetch-langfuse) + + uv run --no-sync python -m benchmarks.helpers.bundle "${bundle_args[@]}" || \ + echo -e "${YELLOW:-}Bundle creation reported errors (best-effort).${NC:-}" + + rm -f "$report_tmp" +} + cleanup() { local exit_code=$? echo "" echo -e "${YELLOW:-}Cleaning up...${NC:-}" + + # Best-effort bundle on interrupt/crash. Idempotent (no-op if already + # created on the success path below). Wrapped in `|| true` so a bundle + # failure can't override the original exit code. + create_bundle || true + if [ "${SKIP_SERVER_CLEANUP:-false}" != "true" ]; then if [ -n "$REGISTRY_PID" ] && kill -0 "$REGISTRY_PID" 2>/dev/null; then echo -e "${BLUE:-}Stopping registry server (PID: $REGISTRY_PID)${NC:-}" @@ -276,62 +372,12 @@ EVAL_EXIT=$? if [ $EVAL_EXIT -eq 0 ]; then echo -e "${GREEN:-}✓${NC:-} M3 evaluation completed successfully" - - # Create reproducibility bundle unless skipped - if [ "${NO_BUNDLE:-false}" != "true" ]; then - echo "" - echo -e "${YELLOW:-}Creating reproducibility bundle...${NC:-}" - - # Find the most recent result file - LATEST_RESULT=$(ls -t "$SCRIPT_DIR/results"/m3_*.json "$SCRIPT_DIR/results"/multiturn_*.json 2>/dev/null | head -1) - if [ -n "$LATEST_RESULT" ]; then - # Determine task file used - if [ "$MULTITURN" = "true" ]; then - TASK_FILE="$SCRIPT_DIR/data/olympics_mutliturn.json" - else - TASK_FILE="$SCRIPT_DIR/data/hockey.json" - fi - - # Generate eval report - REPORT_TMP=$(mktemp /tmp/m3_eval_report_XXXXXX) - uv run --no-sync python -m benchmarks.helpers.compare_report eval \ - --result-file "$LATEST_RESULT" --output "$REPORT_TMP" - - BUNDLE_ARGS=(assemble --benchmark m3 - --result-files "$LATEST_RESULT" - --task-files "$TASK_FILE" - --report "$REPORT_TMP") - if [ -n "$MODEL_PROFILE" ]; then - BUNDLE_ARGS+=(--model-profile "$MODEL_PROFILE") - fi - if [ "$NO_POLICIES" = "true" ]; then - BUNDLE_ARGS+=(--no-policies) - fi - if [ "${BUNDLE_ZIP:-false}" = "true" ]; 
then - BUNDLE_ARGS+=(--zip) - fi - # Include cuga trajectories - TRAJ_DIR=$(find_latest_trajectory "$SCRIPT_DIR/logging/trajectory_data") - if [ -n "$TRAJ_DIR" ]; then - BUNDLE_ARGS+=(--trajectory-dir "$TRAJ_DIR") - fi - # Include server and console logs - # Note: eval_m3.py creates registry_server.log in the benchmark directory - REGISTRY_LOG="$SCRIPT_DIR/registry_server.log" - if [ -f "$REGISTRY_LOG" ]; then - BUNDLE_ARGS+=(--log-files "$REGISTRY_LOG" "$CONSOLE_LOG") - else - # Fallback to /tmp location if registry_server.log doesn't exist - BUNDLE_ARGS+=(--log-files /tmp/m3_registry.log "$CONSOLE_LOG") - fi - # Download Langfuse traces if available - BUNDLE_ARGS+=(--fetch-langfuse) - uv run --no-sync python -m benchmarks.helpers.bundle "${BUNDLE_ARGS[@]}" - rm -f "$REPORT_TMP" - fi - fi + # Create reproducibility bundle (idempotent — cleanup trap also calls + # this on interrupt/crash, see #91, #92). + create_bundle else echo -e "${RED:-}✗ M3 evaluation failed (exit code: $EVAL_EXIT)${NC:-}" + # cleanup trap will call create_bundle to salvage what we have. fi # Re-echo the --m3-data summary as the very last thing on screen, so it's diff --git a/benchmarks/m3/eval_m3.py b/benchmarks/m3/eval_m3.py index efc8cb9..6c82c0f 100644 --- a/benchmarks/m3/eval_m3.py +++ b/benchmarks/m3/eval_m3.py @@ -2415,6 +2415,12 @@ async def run_config_mode(args, container_runtime: str, defer_save: bool = False batch_size = args.batch_size or 1 sequential_mode = batch_size < 2 + # Hoisted so the KeyboardInterrupt / Exception handlers below can save + # whatever was collected if the eval is interrupted (#91, #92). In + # sequential mode results are appended as tasks complete; in batched + # mode evaluate_tasks_in_batches replaces the list with its return. + all_results: List[Dict[str, Any]] = [] + try: # Start registry if enabled. 
In sequential mode we *don't* start a + # shared registry here — each service spawns its own mini registry @@ -2615,13 +2621,21 @@ def _service_has_wanted_domain(svc_dict): # Concurrency: sequential by default, batched when --batch-size >= 2. # "Fully parallel" is just a large batch size (>= total tasks). - all_results: List[Dict[str, Any]] = [] + # (all_results is hoisted to before the try block; clear it here.) + all_results.clear() if not sequential_mode: - # Batched evaluation returns an already-flattened list. - all_results = await evaluate_tasks_in_batches( - task_evaluations=task_evaluations, - batch_size=batch_size, - args=args, + # Batched evaluation returns an already-flattened list, which + # only lands in all_results after the await returns. If the + # run is interrupted mid-await, extend() never executes, so the + # handlers below see an empty list and write no partial file; + # partial saves are effectively sequential-mode only unless + # evaluate_tasks_in_batches checkpoints results itself. + all_results.extend( + await evaluate_tasks_in_batches( + task_evaluations=task_evaluations, + batch_size=batch_size, + args=args, + ) ) else: logger.info(f"\n{'=' * 80}") @@ -2705,6 +2719,37 @@ def _service_has_wanted_domain(svc_dict): return all_results + except (KeyboardInterrupt, asyncio.CancelledError): + # User hit Ctrl-C or the task group was cancelled. Save whatever + # tasks we managed to complete so the shell-side `create_bundle` + # has something to bundle, then re-raise so the script exits with + # the right status. (Bug #91, #92.) 
+ logger.warning("⛔ Evaluation interrupted — saving any partial results before exit...") + try: + if all_results: + output_dir = Path(__file__).parent / "results" + prefix = "m3_config_no_gt_partial" if no_ground_truth else "m3_config_partial" + saved_path = save_evaluation_results(all_results, output_dir, prefix=prefix) + logger.warning(f"📁 Partial results ({len(all_results)} task-results) saved to: {saved_path}") + else: + logger.warning("(no partial results collected yet)") + except Exception as save_err: + logger.error(f"Failed to save partial results: {save_err}") + raise + except Exception as eval_err: + # An unexpected exception bubbled out of the eval loop. Same + # partial-save logic as the interrupt path, then re-raise. (Bug #92.) + logger.error(f"❌ Evaluation aborted by unexpected error: {eval_err}") + try: + if all_results: + output_dir = Path(__file__).parent / "results" + prefix = "m3_config_no_gt_partial" if no_ground_truth else "m3_config_partial" + saved_path = save_evaluation_results(all_results, output_dir, prefix=prefix) + logger.warning(f"📁 Partial results ({len(all_results)} task-results) saved to: {saved_path}") + except Exception as save_err: + logger.error(f"Failed to save partial results: {save_err}") + raise + finally: # Stop registry if it was started if registry_process is not None: diff --git a/benchmarks/m3/eval_m3_multiturn.py b/benchmarks/m3/eval_m3_multiturn.py index 144fe83..48a223c 100644 --- a/benchmarks/m3/eval_m3_multiturn.py +++ b/benchmarks/m3/eval_m3_multiturn.py @@ -3,7 +3,7 @@ This script: 1. Loads policies (optional) 2. Loads tools from the registry -3. Evaluates each multi-turn task in olympics_mutliturn.json +3. Evaluates each multi-turn task in olympics_multiturn.json 4. Handles multiple turns in the same conversation thread 5. Checks keywords in final responses 6. Reports results @@ -104,7 +104,7 @@ async def evaluate_multiturn_task(self, sample: Dict[str, Any], sample_index: in """Evaluate a single multi-turn task. 
Args: - sample: Sample dictionary from olympics_mutliturn.json + sample: Sample dictionary from olympics_multiturn.json sample_index: Index of the sample (for unique thread_id generation) Returns: @@ -148,13 +148,13 @@ async def evaluate_multiturn_task(self, sample: Dict[str, Any], sample_index: in async def evaluate_all(self, data_path: str = None): """ - Evaluate all samples from olympics_mutliturn.json. + Evaluate all samples from olympics_multiturn.json. Args: - data_path: Path to olympics_mutliturn.json file (defaults to data/olympics_mutliturn.json) + data_path: Path to olympics_multiturn.json file (defaults to data/olympics_multiturn.json) """ if data_path is None: - data_path = os.path.join(os.path.dirname(__file__), "data", "olympics_mutliturn.json") + data_path = os.path.join(os.path.dirname(__file__), "data", "olympics_multiturn.json") # Load test data with open(data_path, "r") as f: @@ -218,7 +218,7 @@ async def main(): dest="task", help="Run specific tasks/samples by ID (e.g., '91_sc_ONLY_API_OUT_DOMAIN'). Accepts multiple.", ) - default_data_file = os.getenv("M3_MULTITURN_DATA_FILE", "olympics_mutliturn.json") + default_data_file = os.getenv("M3_MULTITURN_DATA_FILE", "olympics_multiturn.json") parser.add_argument( "--data", type=str, diff --git a/benchmarks/m3/tests/conftest.py b/benchmarks/m3/tests/conftest.py index a86ad84..5fde18a 100644 --- a/benchmarks/m3/tests/conftest.py +++ b/benchmarks/m3/tests/conftest.py @@ -9,5 +9,15 @@ """ import os +import sys +from pathlib import Path + +# When pytest is invoked with a single m3 test path (e.g. pytest +# benchmarks/m3/tests/test_foo.py), the project root isn't on sys.path, +# so `from benchmarks.helpers...` imports fail. The bpo conftest does the +# same thing — mirror it here so m3 tests are runnable in isolation. 
+_project_root = Path(__file__).resolve().parents[3] +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root)) os.environ.setdefault("API_KEY", "test-key-not-used") # noqa: S105 diff --git a/benchmarks/m3/tests/test_partial_save_on_interrupt.py b/benchmarks/m3/tests/test_partial_save_on_interrupt.py new file mode 100644 index 0000000..61b6dc6 --- /dev/null +++ b/benchmarks/m3/tests/test_partial_save_on_interrupt.py @@ -0,0 +1,81 @@ +"""Regression tests for issues #91 and #92. + +When the M3 eval is interrupted (Ctrl-C) or crashes mid-run, we want: +- The already-completed task results to be saved as a JSON file so the + bundling step still has something to package. +- A clearly distinguishable filename prefix (``m3_config_partial``) so + consumers can tell a partial save from a complete run. + +The full ``run_config_mode`` is far too entangled (registry startup, +container runtime detection, MCP server, Vakra scoring) to drive end-to-end +in a unit test. Instead, these tests exercise the small contract that the +interrupt handler relies on: + +1. ``save_evaluation_results`` accepts a partial result list and writes + valid JSON with the expected ``m3_config_partial`` prefix. +2. ``save_evaluation_results`` honours the separate + ``m3_config_no_gt_partial`` prefix used by the ``--no-ground-truth`` branch. + +A pure-bash regression for ``eval.sh`` / ``compare.sh`` is too brittle to +add to the standard regression suite because it requires the full eval +toolchain (uv, python entrypoints) to run. The shell-side behavior is +verified manually per the PR test plan.
+""" + +import json +from pathlib import Path + +import pytest + +from benchmarks.helpers.sdk_eval_helpers import save_evaluation_results + +pytestmark = pytest.mark.regression + + +def _sample_result(task_name: str, success: bool) -> dict: + return { + "task_name": task_name, + "uuid": task_name, + "difficulty": "easy", + "success": success, + "match_rate": 1.0 if success else 0.0, + "found_keywords": [], + "missing_keywords": [], + } + + +def test_partial_results_saved_with_partial_prefix(tmp_path: Path) -> None: + """A non-empty partial result list lands in m3_config_partial_*.json.""" + partial_results = [ + _sample_result("hockey_395_0", success=True), + _sample_result("hockey_395_1", success=False), + ] + + saved_path_str = save_evaluation_results(partial_results, tmp_path, prefix="m3_config_partial") + saved_path = Path(saved_path_str) + + assert saved_path.exists(), f"expected partial result file at {saved_path}" + assert saved_path.name.startswith("m3_config_partial_"), ( + f"partial saves must use the 'm3_config_partial' prefix so they're " + f"distinguishable from complete runs; got: {saved_path.name}" + ) + + # File must contain valid JSON with both task results intact. 
+ loaded = json.loads(saved_path.read_text(encoding="utf-8")) + if isinstance(loaded, dict): + # Some helper variants nest results under a top-level key + results = loaded.get("results", loaded) + else: + results = loaded + assert isinstance(results, list), f"top-level shape should be list-like; got {type(results)}" + task_names = {r.get("task_name") for r in results} + assert task_names == {"hockey_395_0", "hockey_395_1"} + + +def test_partial_save_with_no_ground_truth_prefix(tmp_path: Path) -> None: + """The --no-ground-truth branch uses a separate partial prefix.""" + partial_results = [_sample_result("hockey_395_0", success=True)] + + saved_path = Path(save_evaluation_results(partial_results, tmp_path, prefix="m3_config_no_gt_partial")) + assert saved_path.exists() + assert saved_path.name.startswith("m3_config_no_gt_partial_"), f"got: {saved_path.name}" diff --git a/justfile b/justfile index f5e20c6..52f4803 100644 --- a/justfile +++ b/justfile @@ -20,6 +20,10 @@ format: test-sanity: uv run pytest -m sanity +# Live smoke: 1 AppWorld (SDK), 1 AppWorld (ReAct), 1 M3 hockey task; checks bundle report.md. +test-smoke-e2e: + bash scripts/smoke_benchmarks.sh + # Sanity + regression suite, run on every PR and push to master. test-regression: uv run pytest -m "sanity or regression" diff --git a/pyproject.toml b/pyproject.toml index c3d1b43..43679a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,10 +91,13 @@ markers = [ "regression: integration tests, run on every PR", "stability: long-running tests, scheduled only", ] -# --import-mode=importlib avoids module-name collisions between the four -# per-benchmark `tests/__init__.py` packages (each named ``tests``), which -# under the default ``prepend`` mode would shadow each other and fail to -# collect with ``ModuleNotFoundError: No module named 'tests.test_X'``. +# Use importlib import mode (`--import-mode=importlib`) so per-benchmark +# `tests/` packages don't collide on the shared `tests.*` module namespace. 
+# Without this, the top-level `tests/` dir and `benchmarks/*/tests/` +# packages both try to register as the `tests` package and pytest fails +# to collect with `ModuleNotFoundError: No module named 'tests.test_avg_steps'` +# (introduced by PR #87 when per-benchmark test dirs were added to +# testpaths). importlib mode imports each test file independently. addopts = "-ra --strict-markers --import-mode=importlib" asyncio_mode = "auto" diff --git a/scripts/create_eval_bundle.py b/scripts/create_eval_bundle.py index c1d0a98..4daa6f6 100644 --- a/scripts/create_eval_bundle.py +++ b/scripts/create_eval_bundle.py @@ -49,7 +49,7 @@ def _default_task_file(benchmark: str, result_file: Path) -> Path | None: data_dir = PROJECT_ROOT / "benchmarks" / benchmark / "data" if benchmark == "m3": if result_file.name.startswith("multiturn_"): - candidate = data_dir / "olympics_mutliturn.json" + candidate = data_dir / "olympics_multiturn.json" else: candidate = data_dir / "hockey.json" return candidate if candidate.exists() else None diff --git a/scripts/smoke_benchmarks.sh b/scripts/smoke_benchmarks.sh new file mode 100755 index 0000000..1ba4130 --- /dev/null +++ b/scripts/smoke_benchmarks.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# End-to-end smoke: one AppWorld (SDK), one AppWorld (ReAct), one M3 hockey task. +# Validates bundle report.md metrics (tokens, steps, time, etc.; cost may be "--"). +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +cd "$ROOT" + +APPWORLD_TASK="${SMOKE_APPWORLD_TASK:-82e2fac_1}" +RUN_START_TS=$(date +%s) + +latest_bundle_report() { + local benchmark="$1" + local bundle_root="$ROOT/benchmarks/$benchmark/evaluation_bundles" + local newest="" newest_mtime=0 + local f mtime + while IFS= read -r -d '' f; do + mtime=$(stat -c %Y "$f" 2>/dev/null || stat -f %m "$f") # GNU form first: on Linux `stat -f` means --file-system and would pollute stdout + if [ "$mtime" -ge "$RUN_START_TS" ] && [ "$mtime" -gt "$newest_mtime" ]; then + newest_mtime=$mtime + newest=$f + fi + done < <(find "$bundle_root" -name report.md -type f -print0 2>/dev/null || true) + if [ -z "$newest" ]; then + echo "No report.md from this smoke run under $bundle_root" >&2 + return 1 + fi + echo "$newest" +} + +free_port() { + local port="$1" + command -v lsof >/dev/null 2>&1 || return 0 + lsof -ti ":$port" >/dev/null 2>&1 || return 0 + + echo "Freeing port $port..." + lsof -ti ":$port" | xargs kill 2>/dev/null || true + for _ in 1 2 3 4 5; do + lsof -ti ":$port" >/dev/null 2>&1 || return 0 + sleep 1 + done + echo "Port $port still occupied; sending SIGKILL..."
+ lsof -ti ":$port" | xargs kill -9 2>/dev/null || true + for _ in 1 2 3 4 5; do + lsof -ti ":$port" >/dev/null 2>&1 || return 0 + sleep 1 + done + echo "Port $port still occupied after SIGKILL" >&2 + return 1 +} + +run_and_check() { + local label="$1" + local benchmark="$2" + shift 2 + echo "" + echo "========== $label ==========" + "$@" + local report + report="$(latest_bundle_report "$benchmark")" + echo "Validating $report" + uv run python -m benchmarks.helpers.validate_bundle_report "$report" +} + +echo "Smoke benchmarks (ROOT=$ROOT, RUN_START_TS=$RUN_START_TS)" + +run_and_check "AppWorld SDK (cuga)" appworld \ + bash "$ROOT/benchmarks/appworld/eval.sh" --sdk --task "$APPWORLD_TASK" + +run_and_check "AppWorld ReAct" appworld \ + bash "$ROOT/benchmarks/appworld/eval.sh" --agent react --task "$APPWORLD_TASK" + +free_port 8001 +run_and_check "M3 hockey (m3_task_2, max-samples 1)" m3 \ + bash "$ROOT/benchmarks/m3/eval.sh" \ + --m3-data "$ROOT/benchmarks/m3/data/small_train.zip" \ + --capability m3_task_2 --domain hockey --max-samples 1 + +echo "" +echo "All smoke benchmark runs passed report validation."