diff --git a/.bob/commands/cuga-create-pr.md b/.bob/commands/cuga-create-pr.md
index e4a809e..7697637 100644
--- a/.bob/commands/cuga-create-pr.md
+++ b/.bob/commands/cuga-create-pr.md
@@ -81,6 +81,12 @@ just ci
 
 (`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice.
 
+Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers):
+
+```bash
+just test-smoke-e2e
+```
+
 #### Run the Command
 
 
diff --git a/.claude/commands/cuga-create-pr.md b/.claude/commands/cuga-create-pr.md
index e4a809e..7697637 100644
--- a/.claude/commands/cuga-create-pr.md
+++ b/.claude/commands/cuga-create-pr.md
@@ -81,6 +81,12 @@ just ci
 
 (`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice.
 
+Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers):
+
+```bash
+just test-smoke-e2e
+```
+
 #### Run the Command
 
 
diff --git a/.cursor/commands/cuga-create-pr.md b/.cursor/commands/cuga-create-pr.md
index e4a809e..7697637 100644
--- a/.cursor/commands/cuga-create-pr.md
+++ b/.cursor/commands/cuga-create-pr.md
@@ -81,6 +81,12 @@ just ci
 
 (`lint`, `test-regression`, and `security`). For docs-only or command-only changes, `just lint` may suffice.
 
+Optional live smoke (needs AppWorld + M3 runtime, API keys, capability containers):
+
+```bash
+just test-smoke-e2e
+```
+
 #### Run the Command
 
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2b067b5..1b693ff 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -45,9 +45,10 @@ or under the root `tests/` directory.
 ```bash
 just lint            # ruff check + ruff format --check
 just test-sanity     # ~5s
+just test-smoke-e2e  # live: AppWorld + M3 (needs API keys, AppWorld, M3 containers)
 just test-regression # ~7s
 just security        # bandit + pip-audit
-just ci              # all of the above
+just ci              # lint + test-regression + security (smoke is optional/manual, not included)
 ```
 
 CI runs the same `lint`, `test-regression`, and `security` checks on
diff --git a/README.md b/README.md
index 18d3a27..e980d72 100644
--- a/README.md
+++ b/README.md
@@ -417,6 +417,16 @@ Bundle structure (comparison):
 
 Bundles are stored in `benchmarks/{benchmark}/evaluation_bundles/` and are git-ignored.
 
+#### Resilience: bundles and partial results on interrupt or crash
+
+M3's `eval.sh`/`compare.sh` salvage a best-effort bundle even when a run is interrupted (**Ctrl-C**) or crashes mid-flight, instead of silently losing everything collected so far:
+
+- **Partial result files**: if the evaluator is interrupted or hits an unexpected exception mid-run, it saves whatever task results were already collected to `benchmarks/m3/results/m3_config_partial_*.json` (or `m3_config_no_gt_partial_*.json` with `--no-ground-truth`), distinguishable from complete-run files by the `partial` prefix.
+- **Bundle on exit**: `create_bundle`/`create_compare_bundle` are idempotent and run from both the success path and the script's `cleanup` trap (`trap cleanup EXIT INT TERM ERR`), so a bundle is produced at most once whether the run finishes normally, is interrupted, or crashes — picking up the freshest result file written during that run. No bundle is created with `--no-bundle` or when the run had not yet written any result file.
+- **Comparisons exclude partials**: `compare.sh` filters `m3_config_partial_*`/`m3_config_no_gt_partial_*` out of its result-file collection so an interrupted run doesn't skew aggregate pass-rate/token totals in a comparison report.
+
+This salvage behavior is best-effort: completed tasks/domains are preserved, and any loss is limited to the task that was in flight at the moment of interruption, whose progress may still be lost.
+
 ---
 
 ## 🔧 Configuration
diff --git a/benchmarks/appworld/eval_appworld_sdk.py b/benchmarks/appworld/eval_appworld_sdk.py
index 125b932..67c9f67 100644
--- a/benchmarks/appworld/eval_appworld_sdk.py
+++ b/benchmarks/appworld/eval_appworld_sdk.py
@@ -59,6 +59,7 @@
     save_evaluation_results,
     setup_agent_with_tools,
 )
+from benchmarks.helpers.sdk_eval_helpers import _react_steps_from_invoke_result
 
 tracker = ActivityTracker()
 var_manager = VariablesManager()
@@ -137,6 +138,7 @@ async def invoke_and_score_appworld(
     eval_dict: Dict[str, Any] = {}
     trace_id: Optional[str] = None
     _langfuse_metrics = None
+    invoke_result_holder: List[Any] = []
 
     async def run_invoke(invoke_config: Optional[dict] = None) -> None:
         nonlocal response, tool_calls, err, is_error, invoked
@@ -148,6 +150,8 @@ async def run_invoke(invoke_config: Optional[dict] = None) -> None:
                 track_tool_calls=track_tool_calls,
                 config=invoke_config or {},
             )
+            invoke_result_holder.clear()
+            invoke_result_holder.append(invoke_result)
             response = invoke_result.answer
             tool_calls = list(invoke_result.tool_calls or []) if track_tool_calls else []
             invoked = True
@@ -310,6 +314,14 @@ def complete_and_eval() -> None:
         result["llm_call_details"] = _langfuse_metrics.llm_call_details
         result["node_timings"] = _langfuse_metrics.node_timings
 
+    agent_steps = None
+    if invoke_result_holder:
+        agent_steps = _react_steps_from_invoke_result(invoke_result_holder[0])
+    if agent_steps is None:
+        agent_steps = len(tracker.steps) or len(tool_calls)
+    if agent_steps is not None:
+        result["steps"] = agent_steps
+
     return result
 
 
@@ -424,6 +436,9 @@ async def evaluate_task(self, task_id: str, task_index: int) -> Dict[str, Any]:
                 user_context = _build_user_context(world)
 
                 def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], intent: str):
+                    agent_steps = result.get("steps")
+                    if agent_steps is None:
+                        agent_steps = len(tracker.steps) or len(result.get("tool_calls") or [])
                     eval_info = result.get("appworld_evaluation") or {}
                     report_md = json.dumps(
                         {
@@ -443,7 +458,7 @@ def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], inte
                             score=0.0,
                             agent_answer="",
                             exception=True,
-                            num_steps=0,
+                            num_steps=agent_steps,
                             total_llm_calls=result.get("total_llm_calls", 0),
                             total_tokens=result.get("total_tokens", 0),
                             total_cost=result.get("total_cost", 0.0),
@@ -461,7 +476,7 @@ def tracker_callback(result: Dict[str, Any], keyword_check: Dict[str, Any], inte
                             score=score,
                             agent_answer=result.get("response", ""),
                             exception=False,
-                            num_steps=0,
+                            num_steps=agent_steps,
                             total_llm_calls=result.get("total_llm_calls", 0),
                             total_tokens=result.get("total_tokens", 0),
                             total_cost=result.get("total_cost", 0.0),
diff --git a/benchmarks/helpers/compare_report.py b/benchmarks/helpers/compare_report.py
index 81bfe7f..f2d4d89 100644
--- a/benchmarks/helpers/compare_report.py
+++ b/benchmarks/helpers/compare_report.py
@@ -48,8 +48,8 @@ def _format_config_label(config_key: str) -> str:
 
 
 def _fmt(val, fmt=","):
-    """Format a numeric value, returning '--' if zero/None."""
-    if val is None or val == 0:
+    """Format a numeric value, returning '--' if None (zero is shown as 0)."""
+    if val is None:
         return "--"
     if fmt == ",":
         # Use 1-decimal precision for floats so we don't surface float-repr
diff --git a/benchmarks/helpers/sdk_eval_helpers.py b/benchmarks/helpers/sdk_eval_helpers.py
index dd6e605..bc56338 100644
--- a/benchmarks/helpers/sdk_eval_helpers.py
+++ b/benchmarks/helpers/sdk_eval_helpers.py
@@ -1059,6 +1059,8 @@ async def evaluate_task_with_langfuse(
         react_steps = _react_steps_from_invoke_result(invoke_result)
         if react_steps is not None:
             result["steps"] = react_steps
+        elif result.get("steps") is None and tool_calls:
+            result["steps"] = len(tool_calls)
 
         if predefined_trace_id:
             result["trace_id"] = predefined_trace_id
diff --git a/benchmarks/helpers/tests/test_validate_bundle_report.py b/benchmarks/helpers/tests/test_validate_bundle_report.py
new file mode 100644
index 0000000..e00e998
--- /dev/null
+++ b/benchmarks/helpers/tests/test_validate_bundle_report.py
@@ -0,0 +1,49 @@
+"""Sanity checks for bundle report.md validation."""
+
+import pytest
+
+from benchmarks.helpers.validate_bundle_report import validate_report_md
+
+pytestmark = pytest.mark.sanity
+
+
+def test_validate_report_ok(tmp_path):
+    """A report with every required metric filled in yields no errors."""
+    report = tmp_path / "report.md"
+    # Write UTF-8 explicitly: the table contains "✓", which a non-UTF-8
+    # platform default encoding (e.g. cp1252 on Windows) cannot encode.
+    report.write_text(
+        """# Evaluation Report
+
+## Summary
+
+- **Total Tokens**: 1,234
+- **Total LLM Calls**: 5
+- **Total Duration**: 12.5s
+
+## Per-Task Results
+
+| Task | Result | Tokens | Cost | LLM Calls | Cache Tokens | Duration | Steps |
+|------|--------|--------|------|-----------|--------------|----------|-------|
+| t1 | ✓ | 1,234 | -- | 5 | 0 | 12.5s | 3 |
+""",
+        encoding="utf-8",
+    )
+    assert validate_report_md(report) == []
+
+
+def test_validate_report_flags_missing_metrics(tmp_path):
+    """A task row whose required metrics are all "--" produces errors."""
+    report = tmp_path / "report.md"
+    report.write_text(
+        """## Per-Task Results
+
+| Task | Result | Tokens | Cost | LLM Calls | Cache Tokens | Duration | Steps |
+|------|--------|--------|------|-----------|--------------|----------|-------|
+| t1 | ✓ | -- | -- | -- | -- | -- | -- |
+""",
+        encoding="utf-8",
+    )
+    errors = validate_report_md(report)
+    assert errors
+    assert any("Tokens" in e for e in errors)
diff --git a/benchmarks/helpers/validate_bundle_report.py b/benchmarks/helpers/validate_bundle_report.py
new file mode 100644
index 0000000..6020646
--- /dev/null
+++ b/benchmarks/helpers/validate_bundle_report.py
@@ -0,0 +1,132 @@
+"""Validate report.md from an eval bundle."""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+# Per-task table columns that must hold a value on every task row.
+_REQUIRED_COLS = frozenset({"Tokens", "LLM Calls", "Cache Tokens", "Duration", "Steps"})
+# Cell contents treated as "no value", in addition to the empty string.
+_EMPTY_MARKERS = frozenset({"--", "—", "-"})
+
+
+def _parse_table_header(line: str) -> list[str] | None:
+    """Return the column names if *line* is a per-task header row, else None.
+
+    A header row starts with ``|``, contains no ``---`` run, and has ``Task``
+    as its first cell.
+    """
+    if not line.startswith("|") or "---" in line:
+        return None
+    cells = [c.strip() for c in line.strip().strip("|").split("|")]
+    return cells if cells and cells[0] == "Task" else None
+
+
+def _is_separator(line: str) -> bool:
+    """Return True if *line* is a table row with a run of three or more dashes."""
+    return line.startswith("|") and re.search(r"-{3,}", line) is not None
+
+
+def validate_report_md(path: Path) -> list[str]:
+    """Return one message per empty required metric in the report at *path*.
+
+    Two things are checked:
+
+    * every data row of a table under a ``## Per-Task`` heading must have a
+      value in each column named in ``_REQUIRED_COLS``;
+    * the ``**Total Tokens**``, ``**Total LLM Calls**`` and
+      ``**Total Duration**`` summary lines, when present, must have a value.
+
+    A value is "empty" when it is blank or one of ``_EMPTY_MARKERS``. An empty
+    list means the report passed.
+    """
+    # Reports contain non-ASCII glyphs such as "✓" and "—"; decode as UTF-8
+    # explicitly instead of relying on the platform's default encoding.
+    text = path.read_text(encoding="utf-8")
+    errors: list[str] = []
+
+    in_per_task = False
+    header_cols: list[str] | None = None
+    required_indices: list[int] = []
+
+    for line_no, line in enumerate(text.splitlines(), start=1):
+        if line.startswith("## Per-Task"):
+            # Entering a per-task section: forget any header seen earlier.
+            in_per_task = True
+            header_cols = None
+            required_indices = []
+            continue
+        if in_per_task and line.startswith("## "):
+            # Any other level-2 heading ends the section.
+            in_per_task = False
+            continue
+        if not in_per_task or not line.startswith("|"):
+            continue
+        if _is_separator(line):
+            continue
+
+        cols = _parse_table_header(line)
+        if cols is not None:
+            header_cols = cols
+            required_indices = [i for i, name in enumerate(header_cols) if name in _REQUIRED_COLS]
+            continue
+
+        if not header_cols or not required_indices:
+            continue
+
+        cells = [c.strip() for c in line.strip().strip("|").split("|")]
+        if len(cells) < len(header_cols):
+            # Too few cells to line up with the header; the row is not checked.
+            continue
+        # Skip rows whose first (up to three) cells are all blank and whose last
+        # cell is blank or an em dash (presumably filler/continuation rows).
+        if all(not cells[i] for i in range(min(3, len(cells)))) and cells[-1] in ("", "—"):
+            continue
+
+        for idx in required_indices:
+            col_name = header_cols[idx]
+            val = cells[idx] if idx < len(cells) else ""
+            if not val or val in _EMPTY_MARKERS:
+                task_label = cells[0] or cells[1] or f"line {line_no}"
+                errors.append(f"{path}:{line_no}: {col_name} is empty for task {task_label!r}")
+
+    # Match the value on the label's own line only: `[ \t]*` and `.*` cannot
+    # cross a newline, so an empty value is not filled in from the next line.
+    # A summary label that does not appear at all is not reported.
+    for label in ("Total Tokens", "Total LLM Calls", "Total Duration"):
+        m = re.search(rf"\*\*{re.escape(label)}\*\*:[ \t]*(.*)", text)
+        if m:
+            val = m.group(1).strip()
+            if not val or val in _EMPTY_MARKERS:
+                errors.append(f"{path}: summary {label} is missing")
+
+    return errors
+
+
+def main() -> int:
+    """Validate the report named on the command line and return an exit code.
+
+    Returns 0 when the report passes, 1 when the file does not exist or any
+    error was found (messages are written to stderr).
+    """
+    import argparse
+    import sys
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("report", type=Path)
+    args = parser.parse_args()
+    if not args.report.is_file():
+        print(f"report not found: {args.report}", file=sys.stderr)
+        return 1
+    errors = validate_report_md(args.report)
+    if errors:
+        for err in errors:
+            print(err, file=sys.stderr)
+        return 1
+    print(f"OK: {args.report}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmarks/m3/compare.sh b/benchmarks/m3/compare.sh
index 4757077..8141108 100755
--- a/benchmarks/m3/compare.sh
+++ b/benchmarks/m3/compare.sh
@@ -181,7 +181,210 @@ runs_done=0
 runs_elapsed_total=0
 compare_t0=$(date +%s)
 
+BUNDLE_DONE=false
+
+# Best-effort comparison bundle. Defined as a function so it can be called
+# from both the success path at the bottom of the script AND from
+# compare_cleanup on interrupt/crash (issues #91, #92). Runs at most once
+# (BUNDLE_DONE); skipped when NO_BUNDLE=true. Reads the CONFIG_RESULT_*,
+# CONFIG_TRAJ_* and CONFIG_LOG_* KEYS/VALS arrays populated by the per-config
+# loop below; if none has result files yet it prints a notice and returns 0.
+create_compare_bundle() {
+    [ "$BUNDLE_DONE" = "true" ] && return 0
+    [[ "${NO_BUNDLE:-false}" == "true" ]] && return 0
+    BUNDLE_DONE=true
+
+    echo ""
+    echo -e "${YELLOW:-}Creating comparison bundle...${NC:-}"
+
+    # Build JSON input: {"model:agent": ["file1.json", ...]}
+    local JSON_PARTS=()
+    local ci config files file_list pfirst f
+    for ci in "${!CONFIG_RESULT_KEYS[@]}"; do
+        config="${CONFIG_RESULT_KEYS[$ci]}"
+        files="${CONFIG_RESULT_VALS[$ci]}"
+        if [[ -z "$files" ]]; then
+            continue
+        fi
+        file_list=""
+        pfirst=true
+        for f in $files; do
+            if [[ "$pfirst" != "true" ]]; then
+                file_list+=","
+            fi
+            pfirst=false
+            file_list+="\"${f}\""
+        done
+        JSON_PARTS+=("\"${config}\":[${file_list}]")
+    done
+
+    local JSON_INPUT="{"
+    local jfirst=true part
+    for part in "${JSON_PARTS[@]}"; do
+        if [[ "$jfirst" != "true" ]]; then
+            JSON_INPUT+=","
+        fi
+        jfirst=false
+        JSON_INPUT+="$part"
+    done
+    JSON_INPUT+="}"
+
+    if [[ "$JSON_INPUT" == "{}" ]]; then
+        echo -e "${YELLOW:-}No completed runs to bundle — skipping.${NC:-}"
+        return 0
+    fi
+
+    # Generate comparison report (best-effort: a failure is reported, not fatal)
+    echo -e "${YELLOW:-}Generating comparison report...${NC:-}"
+    local REPORT_TMP
+    REPORT_TMP=$(mktemp /tmp/m3_report_XXXXXX)
+    echo "$JSON_INPUT" | (cd "$PROJECT_ROOT" && uv run --no-sync python -m benchmarks.helpers.compare_report --output "$REPORT_TMP") \
+        || echo -e "${YELLOW:-}Report generation failed — bundling without comparison report.${NC:-}"
+    echo ""
+
+    # Build per-model env snapshot for bundle (only if the helper is defined)
+    local MODEL_ENVS_JSON=""
+    if type build_model_envs_json &>/dev/null; then
+        MODEL_ENVS_JSON=$(build_model_envs_json "${MODEL_LIST[@]}")
+    fi
+
+    # Build per-config trajectory dirs JSON grouped by run:
+    # {"model:agent:policy": [["/run1/domA", ...], ["/run2/domA", ...]]}
+    # CONFIG_TRAJ_VALS holds sentinel-delimited groups (one per eval run).
+    local TRAJ_JSON_PARTS=()
+    local tconfig tgroups groups_json cur_group in_group line
+    for ci in "${!CONFIG_TRAJ_KEYS[@]}"; do
+        tconfig="${CONFIG_TRAJ_KEYS[$ci]}"
+        tgroups="${CONFIG_TRAJ_VALS[$ci]}"
+        if [[ -z "$tgroups" ]]; then
+            continue
+        fi
+        groups_json=""
+        cur_group=""
+        in_group=false
+        while IFS= read -r line; do
+            if [[ "$line" == "$TRAJ_GROUP_SEP" ]]; then
+                if [[ "$in_group" == "true" ]]; then
+                    if [[ -n "$groups_json" ]]; then groups_json+=","; fi
+                    groups_json+="[${cur_group}]"
+                fi
+                cur_group=""
+                in_group=true
+                continue
+            fi
+            [[ -z "$line" ]] && continue
+            if [[ -n "$cur_group" ]]; then cur_group+=","; fi
+            cur_group+="\"${line}\""
+        done <<< "$tgroups"
+        if [[ "$in_group" == "true" ]]; then
+            if [[ -n "$groups_json" ]]; then groups_json+=","; fi
+            groups_json+="[${cur_group}]"
+        fi
+        if [[ -z "$groups_json" ]]; then
+            continue
+        fi
+        TRAJ_JSON_PARTS+=("\"${tconfig}\":[${groups_json}]")
+    done
+
+    local TRAJ_JSON_INPUT="{"
+    local tjfirst=true
+    for part in "${TRAJ_JSON_PARTS[@]}"; do
+        if [[ "$tjfirst" != "true" ]]; then
+            TRAJ_JSON_INPUT+=","
+        fi
+        tjfirst=false
+        TRAJ_JSON_INPUT+="$part"
+    done
+    TRAJ_JSON_INPUT+="}"
+
+    # Determine task file: multiturn data if --multiturn was forwarded, else hockey
+    local TASK_FILE="$SCRIPT_DIR/data/hockey.json"
+    local arg
+    for arg in "${FORWARDED_ARGS[@]}"; do
+        if [[ "$arg" == "--multiturn" ]]; then
+            TASK_FILE="$SCRIPT_DIR/data/olympics_multiturn.json"
+            break
+        fi
+    done
+
+    local BUNDLE_CMD=(uv run --no-sync python -m benchmarks.helpers.bundle assemble-compare
+        --benchmark m3
+        --config-results "$JSON_INPUT"
+        --report "$REPORT_TMP"
+        --task-files "$TASK_FILE")
+
+    if [[ -n "$MODEL_ENVS_JSON" ]]; then
+        BUNDLE_CMD+=(--model-envs "$MODEL_ENVS_JSON")
+    fi
+    if [[ "$TRAJ_JSON_INPUT" != "{}" ]]; then
+        BUNDLE_CMD+=(--trajectory-dirs "$TRAJ_JSON_INPUT")
+    fi
+    # Build per-config log JSON grouped by run (one console+registry log set
+    # per eval run) so each run folder gets its OWN logs:
+    # {"model:agent:policy": [["/run1/console.log", ...], ["/run2/...", ...]]}
+    local LOG_JSON_PARTS=()
+    local lconfig lgroups lgroups_json lcur_group lin_group
+    for ci in "${!CONFIG_LOG_KEYS[@]}"; do
+        lconfig="${CONFIG_LOG_KEYS[$ci]}"
+        lgroups="${CONFIG_LOG_VALS[$ci]}"
+        if [[ -z "$lgroups" ]]; then
+            continue
+        fi
+        lgroups_json=""
+        lcur_group=""
+        lin_group=false
+        while IFS= read -r line; do
+            if [[ "$line" == "$LOG_GROUP_SEP" ]]; then
+                if [[ "$lin_group" == "true" ]]; then
+                    if [[ -n "$lgroups_json" ]]; then lgroups_json+=","; fi
+                    lgroups_json+="[${lcur_group}]"
+                fi
+                lcur_group=""
+                lin_group=true
+                continue
+            fi
+            [[ -z "$line" ]] && continue
+            if [[ -n "$lcur_group" ]]; then lcur_group+=","; fi
+            lcur_group+="\"${line}\""
+        done <<< "$lgroups"
+        if [[ "$lin_group" == "true" ]]; then
+            if [[ -n "$lgroups_json" ]]; then lgroups_json+=","; fi
+            lgroups_json+="[${lcur_group}]"
+        fi
+        if [[ -z "$lgroups_json" ]]; then
+            continue
+        fi
+        LOG_JSON_PARTS+=("\"${lconfig}\":[${lgroups_json}]")
+    done
+    local LOG_JSON="{"
+    local ljfirst=true
+    for part in "${LOG_JSON_PARTS[@]}"; do
+        if [[ "$ljfirst" != "true" ]]; then LOG_JSON+=","; fi
+        ljfirst=false
+        LOG_JSON+="$part"
+    done
+    LOG_JSON+="}"
+    if [[ "$LOG_JSON" != "{}" ]]; then
+        BUNDLE_CMD+=(--log-files "$LOG_JSON")
+    fi
+    # Download Langfuse traces if available
+    BUNDLE_CMD+=(--fetch-langfuse)
+    if [[ "${BUNDLE_ZIP:-false}" == "true" ]]; then
+        BUNDLE_CMD+=(--zip)
+    fi
+
+    # Bundle CLI is run from the project root; a failure is reported, not fatal
+    (cd "$PROJECT_ROOT" && "${BUNDLE_CMD[@]}") \
+        || echo -e "${YELLOW:-}Bundle creation reported errors (best-effort).${NC:-}"
+    rm -f "$REPORT_TMP"
+}
+
 compare_cleanup() {
+    # Best-effort comparison bundle on interrupt/crash (issues #91, #92).
+    # If we made it past the per-config loop the success path below will have
+    # already created the bundle; BUNDLE_DONE makes this idempotent.
+    create_compare_bundle || true
+
     echo -e "${YELLOW:-}Stopping servers...${NC:-}"
     kill_port_processes "${REGISTRY_PORT:-8001}"
     # Staged per-run logs were already copied into the bundle by now.
@@ -214,8 +417,14 @@ LOG_GROUP_SEP="@@RUN@@"
 # the right glob for each agent.
 _list_results_for_agent() {
     local agent="$1"
+    # Exclude interrupt/crash partial saves (m3_config_partial_*,
+    # m3_config_no_gt_partial_*) — they're incomplete runs and would skew
+    # compare_report's totals/pass-rate aggregates if folded in alongside
+    # complete runs.
     if [[ "$agent" == "cuga" ]]; then
-        ls -1 "$RESULTS_DIR"/m3_config_*.json 2>/dev/null | sort
+        ls -1 "$RESULTS_DIR"/m3_config_*.json 2>/dev/null \
+            | grep -vE '/m3_config_(no_gt_)?partial_' \
+            | sort
     else
         # react: m3_*.json but NOT m3_config_*.json (and not multiturn either,
         # which is a separate flow).
@@ -335,179 +544,7 @@ if [ -n "$OUTPUT_FILE" ]; then
     echo -e "${GREEN:-}✓${NC:-} Results in: $RESULTS_DIR"
 fi
 
-# Create reproducibility bundle unless skipped
-if [[ "${NO_BUNDLE:-false}" != "true" ]]; then
-    echo ""
-    echo -e "${YELLOW:-}Creating comparison bundle...${NC:-}"
-
-    # Build JSON input: {"model:agent": ["file1.json", ...]}
-    JSON_PARTS=()
-    for ci in "${!CONFIG_RESULT_KEYS[@]}"; do
-        config="${CONFIG_RESULT_KEYS[$ci]}"
-        files="${CONFIG_RESULT_VALS[$ci]}"
-        if [[ -z "$files" ]]; then
-            continue
-        fi
-        file_list=""
-        pfirst=true
-        for f in $files; do
-            if [[ "$pfirst" != "true" ]]; then
-                file_list+=","
-            fi
-            pfirst=false
-            file_list+="\"${f}\""
-        done
-        JSON_PARTS+=("\"${config}\":[${file_list}]")
-    done
-
-    JSON_INPUT="{"
-    jfirst=true
-    for part in "${JSON_PARTS[@]}"; do
-        if [[ "$jfirst" != "true" ]]; then
-            JSON_INPUT+=","
-        fi
-        jfirst=false
-        JSON_INPUT+="$part"
-    done
-    JSON_INPUT+="}"
-
-    if [[ "$JSON_INPUT" != "{}" ]]; then
-        # Generate comparison report
-        echo -e "${YELLOW:-}Generating comparison report...${NC:-}"
-        REPORT_TMP=$(mktemp /tmp/m3_report_XXXXXX)
-        echo "$JSON_INPUT" | (cd "$PROJECT_ROOT" && uv run --no-sync python -m benchmarks.helpers.compare_report --output "$REPORT_TMP")
-        echo ""
-
-        # Build per-model env snapshot for bundle
-        MODEL_ENVS_JSON=""
-        if type build_model_envs_json &>/dev/null; then
-            MODEL_ENVS_JSON=$(build_model_envs_json "${MODEL_LIST[@]}")
-        fi
-
-        # Build per-config trajectory JSON grouped by run:
-        # {"model:agent:policy": [["/run1/domA", ...], ["/run2/domA", ...]]}
-        # CONFIG_TRAJ_VALS holds sentinel-delimited groups (one per eval run).
-        TRAJ_JSON_PARTS=()
-        for ci in "${!CONFIG_TRAJ_KEYS[@]}"; do
-            tconfig="${CONFIG_TRAJ_KEYS[$ci]}"
-            tgroups="${CONFIG_TRAJ_VALS[$ci]}"
-            if [[ -z "$tgroups" ]]; then
-                continue
-            fi
-            groups_json=""
-            cur_group=""
-            in_group=false
-            while IFS= read -r line; do
-                if [[ "$line" == "$TRAJ_GROUP_SEP" ]]; then
-                    if [[ "$in_group" == "true" ]]; then
-                        if [[ -n "$groups_json" ]]; then groups_json+=","; fi
-                        groups_json+="[${cur_group}]"
-                    fi
-                    cur_group=""
-                    in_group=true
-                    continue
-                fi
-                [[ -z "$line" ]] && continue
-                if [[ -n "$cur_group" ]]; then cur_group+=","; fi
-                cur_group+="\"${line}\""
-            done <<< "$tgroups"
-            if [[ "$in_group" == "true" ]]; then
-                if [[ -n "$groups_json" ]]; then groups_json+=","; fi
-                groups_json+="[${cur_group}]"
-            fi
-            if [[ -z "$groups_json" ]]; then
-                continue
-            fi
-            TRAJ_JSON_PARTS+=("\"${tconfig}\":[${groups_json}]")
-        done
-
-        TRAJ_JSON_INPUT="{"
-        tjfirst=true
-        for part in "${TRAJ_JSON_PARTS[@]}"; do
-            if [[ "$tjfirst" != "true" ]]; then
-                TRAJ_JSON_INPUT+=","
-            fi
-            tjfirst=false
-            TRAJ_JSON_INPUT+="$part"
-        done
-        TRAJ_JSON_INPUT+="}"
-
-        # Determine task file
-        TASK_FILE="$SCRIPT_DIR/data/hockey.json"
-        for arg in "${FORWARDED_ARGS[@]}"; do
-            if [[ "$arg" == "--multiturn" ]]; then
-                TASK_FILE="$SCRIPT_DIR/data/olympics_mutliturn.json"
-                break
-            fi
-        done
-
-        BUNDLE_CMD=(uv run --no-sync python -m benchmarks.helpers.bundle assemble-compare
-            --benchmark m3
-            --config-results "$JSON_INPUT"
-            --report "$REPORT_TMP"
-            --task-files "$TASK_FILE")
-
-        if [[ -n "$MODEL_ENVS_JSON" ]]; then
-            BUNDLE_CMD+=(--model-envs "$MODEL_ENVS_JSON")
-        fi
-        if [[ "$TRAJ_JSON_INPUT" != "{}" ]]; then
-            BUNDLE_CMD+=(--trajectory-dirs "$TRAJ_JSON_INPUT")
-        fi
-        # Build per-config log JSON grouped by run (one console+registry log set
-        # per eval run) so each run folder gets its OWN logs:
-        # {"model:agent:policy": [["/run1/console.log", ...], ["/run2/...", ...]]}
-        LOG_JSON_PARTS=()
-        for ci in "${!CONFIG_LOG_KEYS[@]}"; do
-            lconfig="${CONFIG_LOG_KEYS[$ci]}"
-            lgroups="${CONFIG_LOG_VALS[$ci]}"
-            if [[ -z "$lgroups" ]]; then
-                continue
-            fi
-            lgroups_json=""
-            lcur_group=""
-            lin_group=false
-            while IFS= read -r line; do
-                if [[ "$line" == "$LOG_GROUP_SEP" ]]; then
-                    if [[ "$lin_group" == "true" ]]; then
-                        if [[ -n "$lgroups_json" ]]; then lgroups_json+=","; fi
-                        lgroups_json+="[${lcur_group}]"
-                    fi
-                    lcur_group=""
-                    lin_group=true
-                    continue
-                fi
-                [[ -z "$line" ]] && continue
-                if [[ -n "$lcur_group" ]]; then lcur_group+=","; fi
-                lcur_group+="\"${line}\""
-            done <<< "$lgroups"
-            if [[ "$lin_group" == "true" ]]; then
-                if [[ -n "$lgroups_json" ]]; then lgroups_json+=","; fi
-                lgroups_json+="[${lcur_group}]"
-            fi
-            if [[ -z "$lgroups_json" ]]; then
-                continue
-            fi
-            LOG_JSON_PARTS+=("\"${lconfig}\":[${lgroups_json}]")
-        done
-        LOG_JSON="{"
-        ljfirst=true
-        for part in "${LOG_JSON_PARTS[@]}"; do
-            if [[ "$ljfirst" != "true" ]]; then LOG_JSON+=","; fi
-            ljfirst=false
-            LOG_JSON+="$part"
-        done
-        LOG_JSON+="}"
-        if [[ "$LOG_JSON" != "{}" ]]; then
-            BUNDLE_CMD+=(--log-files "$LOG_JSON")
-        fi
-        # Download Langfuse traces if available
-        BUNDLE_CMD+=(--fetch-langfuse)
-        if [[ "${BUNDLE_ZIP:-false}" == "true" ]]; then
-            BUNDLE_CMD+=(--zip)
-        fi
-
-        # Bundle CLI needs project root on PYTHONPATH
-        (cd "$PROJECT_ROOT" && "${BUNDLE_CMD[@]}")
-        rm -f "$REPORT_TMP"
-    fi
-fi
+# Create the comparison bundle (success path). Idempotent — if the cleanup
+# trap already created it on interrupt, this is a no-op. See create_compare_bundle
+# definition near the top of this script and #91, #92.
+create_compare_bundle
diff --git a/benchmarks/m3/data/olympics_mutliturn.json b/benchmarks/m3/data/olympics_multiturn.json
similarity index 100%
rename from benchmarks/m3/data/olympics_mutliturn.json
rename to benchmarks/m3/data/olympics_multiturn.json
diff --git a/benchmarks/m3/eval.sh b/benchmarks/m3/eval.sh
index f9cafde..250b8f9 100755
--- a/benchmarks/m3/eval.sh
+++ b/benchmarks/m3/eval.sh
@@ -114,10 +114,106 @@ done
 
 REGISTRY_PID=""
 
+# Timestamp captured before the eval starts. Used by create_bundle to pick
+# only the result file(s) produced by *this* run, not a leftover from earlier.
+RUN_START_TS=$(date +%s)
+BUNDLE_DONE=false
+
+# Best-effort bundle creation. Called from the success path AND from the
+# cleanup trap on Ctrl-C / crash / non-zero exit (issues #91, #92), so an
+# interrupted run still leaves logs + trajectories + any results already
+# written. No-op once attempted (BUNDLE_DONE) or when NO_BUNDLE=true; prints
+# a notice and skips if this run has not written a result file yet.
+create_bundle() {
+    [ "$BUNDLE_DONE" = "true" ] && return 0
+    [ "${NO_BUNDLE:-false}" = "true" ] && return 0
+    BUNDLE_DONE=true
+
+    echo ""
+    echo -e "${YELLOW:-}Creating reproducibility bundle...${NC:-}"
+
+    # Find the most recent result file produced by *this* run (mtime at or
+    # after RUN_START_TS). `ls -t` lists newest first, so the first match
+    # wins. If the run was killed before any save there is nothing to find
+    # and the bundle is skipped — it is meaningless without a results JSON.
+    local latest_result=""
+    local f f_mtime
+    for f in $(ls -t "$SCRIPT_DIR/results"/m3_*.json "$SCRIPT_DIR/results"/multiturn_*.json 2>/dev/null); do
+        # GNU `stat -c` first: on Linux `stat -f` means --file-system and prints junk to stdout.
+        f_mtime=$(stat -c %Y "$f" 2>/dev/null || stat -f %m "$f" 2>/dev/null)
+        if [ -n "$f_mtime" ] && [ "$f_mtime" -ge "$RUN_START_TS" ]; then
+            latest_result="$f"
+            break
+        fi
+    done
+
+    if [ -z "$latest_result" ]; then
+        echo -e "${YELLOW:-}No result file from this run was found — skipping bundle.${NC:-}"
+        echo -e "${YELLOW:-}(Console log is still at $CONSOLE_LOG.)${NC:-}"
+        return 0
+    fi
+
+    # Determine task file used
+    local task_file
+    if [ "$MULTITURN" = "true" ]; then
+        task_file="$SCRIPT_DIR/data/olympics_multiturn.json"
+    else
+        task_file="$SCRIPT_DIR/data/hockey.json"
+    fi
+
+    # Generate eval report (best effort — if report generation fails we still
+    # want the bundle, so don't let `set -e` abort here).
+    local report_tmp
+    report_tmp=$(mktemp /tmp/m3_eval_report_XXXXXX)
+    uv run --no-sync python -m benchmarks.helpers.compare_report eval \
+        --result-file "$latest_result" --output "$report_tmp" || \
+        echo -e "${YELLOW:-}Report generation failed — bundling without report.${NC:-}"
+
+    local bundle_args=(assemble --benchmark m3
+        --result-files "$latest_result"
+        --task-files "$task_file"
+        --report "$report_tmp")
+    if [ -n "$MODEL_PROFILE" ]; then
+        bundle_args+=(--model-profile "$MODEL_PROFILE")
+    fi
+    if [ "$NO_POLICIES" = "true" ]; then
+        bundle_args+=(--no-policies)
+    fi
+    if [ "${BUNDLE_ZIP:-false}" = "true" ]; then
+        bundle_args+=(--zip)
+    fi
+    # Include cuga trajectories
+    local traj_dir
+    traj_dir=$(find_latest_trajectory "$SCRIPT_DIR/logging/trajectory_data")
+    if [ -n "$traj_dir" ]; then
+        bundle_args+=(--trajectory-dir "$traj_dir")
+    fi
+    # Include server and console logs (whichever exists)
+    local registry_log="$SCRIPT_DIR/registry_server.log"
+    if [ -f "$registry_log" ]; then
+        bundle_args+=(--log-files "$registry_log" "$CONSOLE_LOG")
+    else
+        bundle_args+=(--log-files /tmp/m3_registry.log "$CONSOLE_LOG")
+    fi
+    # Download Langfuse traces if available
+    bundle_args+=(--fetch-langfuse)
+
+    uv run --no-sync python -m benchmarks.helpers.bundle "${bundle_args[@]}" || \
+        echo -e "${YELLOW:-}Bundle creation reported errors (best-effort).${NC:-}"
+
+    rm -f "$report_tmp"
+}
+
 cleanup() {
     local exit_code=$?
     echo ""
     echo -e "${YELLOW:-}Cleaning up...${NC:-}"
+
+    # Best-effort bundle on interrupt/crash. Idempotent (no-op if already
+    # created on the success path below). Wrapped in `|| true` so a bundle
+    # failure can't override the original exit code.
+    create_bundle || true
+
     if [ "${SKIP_SERVER_CLEANUP:-false}" != "true" ]; then
         if [ -n "$REGISTRY_PID" ] && kill -0 "$REGISTRY_PID" 2>/dev/null; then
             echo -e "${BLUE:-}Stopping registry server (PID: $REGISTRY_PID)${NC:-}"
@@ -276,62 +372,12 @@ EVAL_EXIT=$?
 
 if [ $EVAL_EXIT -eq 0 ]; then
     echo -e "${GREEN:-}✓${NC:-} M3 evaluation completed successfully"
-
-    # Create reproducibility bundle unless skipped
-    if [ "${NO_BUNDLE:-false}" != "true" ]; then
-        echo ""
-        echo -e "${YELLOW:-}Creating reproducibility bundle...${NC:-}"
-
-        # Find the most recent result file
-        LATEST_RESULT=$(ls -t "$SCRIPT_DIR/results"/m3_*.json "$SCRIPT_DIR/results"/multiturn_*.json 2>/dev/null | head -1)
-        if [ -n "$LATEST_RESULT" ]; then
-            # Determine task file used
-            if [ "$MULTITURN" = "true" ]; then
-                TASK_FILE="$SCRIPT_DIR/data/olympics_mutliturn.json"
-            else
-                TASK_FILE="$SCRIPT_DIR/data/hockey.json"
-            fi
-
-            # Generate eval report
-            REPORT_TMP=$(mktemp /tmp/m3_eval_report_XXXXXX)
-            uv run --no-sync python -m benchmarks.helpers.compare_report eval \
-                --result-file "$LATEST_RESULT" --output "$REPORT_TMP"
-
-            BUNDLE_ARGS=(assemble --benchmark m3
-                --result-files "$LATEST_RESULT"
-                --task-files "$TASK_FILE"
-                --report "$REPORT_TMP")
-            if [ -n "$MODEL_PROFILE" ]; then
-                BUNDLE_ARGS+=(--model-profile "$MODEL_PROFILE")
-            fi
-            if [ "$NO_POLICIES" = "true" ]; then
-                BUNDLE_ARGS+=(--no-policies)
-            fi
-            if [ "${BUNDLE_ZIP:-false}" = "true" ]; then
-                BUNDLE_ARGS+=(--zip)
-            fi
-            # Include cuga trajectories
-            TRAJ_DIR=$(find_latest_trajectory "$SCRIPT_DIR/logging/trajectory_data")
-            if [ -n "$TRAJ_DIR" ]; then
-                BUNDLE_ARGS+=(--trajectory-dir "$TRAJ_DIR")
-            fi
-            # Include server and console logs
-            # Note: eval_m3.py creates registry_server.log in the benchmark directory
-            REGISTRY_LOG="$SCRIPT_DIR/registry_server.log"
-            if [ -f "$REGISTRY_LOG" ]; then
-                BUNDLE_ARGS+=(--log-files "$REGISTRY_LOG" "$CONSOLE_LOG")
-            else
-                # Fallback to /tmp location if registry_server.log doesn't exist
-                BUNDLE_ARGS+=(--log-files /tmp/m3_registry.log "$CONSOLE_LOG")
-            fi
-            # Download Langfuse traces if available
-            BUNDLE_ARGS+=(--fetch-langfuse)
-            uv run --no-sync python -m benchmarks.helpers.bundle "${BUNDLE_ARGS[@]}"
-            rm -f "$REPORT_TMP"
-        fi
-    fi
+    # Create reproducibility bundle (idempotent — cleanup trap also calls
+    # this on interrupt/crash, see #91, #92).
+    create_bundle
 else
     echo -e "${RED:-}✗ M3 evaluation failed (exit code: $EVAL_EXIT)${NC:-}"
+    # cleanup trap will call create_bundle to salvage what we have.
 fi
 
 # Re-echo the --m3-data summary as the very last thing on screen, so it's
diff --git a/benchmarks/m3/eval_m3.py b/benchmarks/m3/eval_m3.py
index efc8cb9..6c82c0f 100644
--- a/benchmarks/m3/eval_m3.py
+++ b/benchmarks/m3/eval_m3.py
@@ -2415,6 +2415,12 @@ async def run_config_mode(args, container_runtime: str, defer_save: bool = False
     batch_size = args.batch_size or 1
     sequential_mode = batch_size < 2
 
+    # Hoisted so the KeyboardInterrupt / Exception handlers below can save
+    # whatever was collected if the eval is interrupted (#91, #92). In
+    # sequential mode results are appended as tasks complete; in batched
+    # mode the list is only extended once evaluate_tasks_in_batches returns.
+    all_results: List[Dict[str, Any]] = []
+
     try:
         # Start registry if enabled. In sequential mode we *don't* start a
         # shared registry here — each service spawns its own mini registry
@@ -2615,13 +2621,21 @@ def _service_has_wanted_domain(svc_dict):
 
         # Concurrency: sequential by default, batched when --batch-size >= 2.
         # "Fully parallel" is just a large batch size (>= total tasks).
-        all_results: List[Dict[str, Any]] = []
+        # (all_results is hoisted to before the try block; clear it here.)
+        all_results.clear()
         if not sequential_mode:
-            # Batched evaluation returns an already-flattened list.
-            all_results = await evaluate_tasks_in_batches(
-                task_evaluations=task_evaluations,
-                batch_size=batch_size,
-                args=args,
+            # Batched evaluation returns an already-flattened list, which is
+            # added to the hoisted all_results in one step. NOTE: nothing
+            # lands in all_results until the helper returns, so an interrupt
+            # during the await leaves it empty and the handlers below save
+            # no partial file in batched mode (the helper's
+            # return_exceptions=True only covers individual task failures).
+            all_results.extend(
+                await evaluate_tasks_in_batches(
+                    task_evaluations=task_evaluations,
+                    batch_size=batch_size,
+                    args=args,
+                )
             )
         else:
             logger.info(f"\n{'=' * 80}")
@@ -2705,6 +2719,37 @@ def _service_has_wanted_domain(svc_dict):
 
         return all_results
 
+    except (KeyboardInterrupt, asyncio.CancelledError):
+        # User hit Ctrl-C or the task group was cancelled. Save whatever
+        # tasks we managed to complete so the shell-side `create_bundle`
+        # has something to bundle, then re-raise so the script exits with
+        # the right status. (Bug #91, #92.)
+        logger.warning("⛔ Evaluation interrupted — saving any partial results before exit...")
+        try:
+            if all_results:
+                output_dir = Path(__file__).parent / "results"
+                prefix = "m3_config_no_gt_partial" if no_ground_truth else "m3_config_partial"
+                saved_path = save_evaluation_results(all_results, output_dir, prefix=prefix)
+                logger.warning(f"📁 Partial results ({len(all_results)} task-results) saved to: {saved_path}")
+            else:
+                logger.warning("(no partial results collected yet)")
+        except Exception as save_err:
+            logger.error(f"Failed to save partial results: {save_err}")
+        raise
+    except Exception as eval_err:
+        # An unexpected exception bubbled out of the eval loop. Same
+        # partial-save logic as the interrupt path, then re-raise. (Bug #92.)
+        logger.error(f"❌ Evaluation aborted by unexpected error: {eval_err}")
+        try:
+            if all_results:
+                output_dir = Path(__file__).parent / "results"
+                prefix = "m3_config_no_gt_partial" if no_ground_truth else "m3_config_partial"
+                saved_path = save_evaluation_results(all_results, output_dir, prefix=prefix)
+                logger.warning(f"📁 Partial results ({len(all_results)} task-results) saved to: {saved_path}")
+        except Exception as save_err:
+            logger.error(f"Failed to save partial results: {save_err}")
+        raise
+
     finally:
         # Stop registry if it was started
         if registry_process is not None:
diff --git a/benchmarks/m3/eval_m3_multiturn.py b/benchmarks/m3/eval_m3_multiturn.py
index 144fe83..48a223c 100644
--- a/benchmarks/m3/eval_m3_multiturn.py
+++ b/benchmarks/m3/eval_m3_multiturn.py
@@ -3,7 +3,7 @@
 This script:
 1. Loads policies (optional)
 2. Loads tools from the registry
-3. Evaluates each multi-turn task in olympics_mutliturn.json
+3. Evaluates each multi-turn task in olympics_multiturn.json
 4. Handles multiple turns in the same conversation thread
 5. Checks keywords in final responses
 6. Reports results
@@ -104,7 +104,7 @@ async def evaluate_multiturn_task(self, sample: Dict[str, Any], sample_index: in
         """Evaluate a single multi-turn task.
 
         Args:
-            sample: Sample dictionary from olympics_mutliturn.json
+            sample: Sample dictionary from olympics_multiturn.json
             sample_index: Index of the sample (for unique thread_id generation)
 
         Returns:
@@ -148,13 +148,13 @@ async def evaluate_multiturn_task(self, sample: Dict[str, Any], sample_index: in
 
     async def evaluate_all(self, data_path: str = None):
         """
-        Evaluate all samples from olympics_mutliturn.json.
+        Evaluate all samples from olympics_multiturn.json.
 
         Args:
-            data_path: Path to olympics_mutliturn.json file (defaults to data/olympics_mutliturn.json)
+            data_path: Path to olympics_multiturn.json file (defaults to data/olympics_multiturn.json)
         """
         if data_path is None:
-            data_path = os.path.join(os.path.dirname(__file__), "data", "olympics_mutliturn.json")
+            data_path = os.path.join(os.path.dirname(__file__), "data", "olympics_multiturn.json")
 
         # Load test data
         with open(data_path, "r") as f:
@@ -218,7 +218,7 @@ async def main():
         dest="task",
         help="Run specific tasks/samples by ID (e.g., '91_sc_ONLY_API_OUT_DOMAIN'). Accepts multiple.",
     )
-    default_data_file = os.getenv("M3_MULTITURN_DATA_FILE", "olympics_mutliturn.json")
+    default_data_file = os.getenv("M3_MULTITURN_DATA_FILE", "olympics_multiturn.json")
     parser.add_argument(
         "--data",
         type=str,
diff --git a/benchmarks/m3/tests/conftest.py b/benchmarks/m3/tests/conftest.py
index a86ad84..5fde18a 100644
--- a/benchmarks/m3/tests/conftest.py
+++ b/benchmarks/m3/tests/conftest.py
@@ -9,5 +9,15 @@
 """
 
 import os
+import sys
+from pathlib import Path
+
+# When pytest is invoked with a single m3 test path (e.g. pytest
+# benchmarks/m3/tests/test_foo.py), the project root isn't on sys.path,
+# so `from benchmarks.helpers...` imports fail. The bpo conftest does the
+# same thing — mirror it here so m3 tests are runnable in isolation.
+_project_root = Path(__file__).resolve().parents[3]
+if str(_project_root) not in sys.path:
+    sys.path.insert(0, str(_project_root))
 
 os.environ.setdefault("API_KEY", "test-key-not-used")  # noqa: S105
diff --git a/benchmarks/m3/tests/test_partial_save_on_interrupt.py b/benchmarks/m3/tests/test_partial_save_on_interrupt.py
new file mode 100644
index 0000000..61b6dc6
--- /dev/null
+++ b/benchmarks/m3/tests/test_partial_save_on_interrupt.py
@@ -0,0 +1,81 @@
+"""Regression tests for issues #91 and #92.
+
+When the M3 eval is interrupted (Ctrl-C) or crashes mid-run, we want:
+- The already-completed task results to be saved as a JSON file so the
+  bundling step still has something to package.
+- A clearly distinguishable filename prefix (``m3_config_partial``) so
+  consumers can tell a partial save from a complete run.
+
+The full ``run_config_mode`` is far too entangled (registry startup,
+container runtime detection, MCP server, Vakra scoring) to drive end-to-end
+in a unit test. Instead, these tests exercise the small contract that the
+interrupt handler relies on:
+
+1. ``save_evaluation_results`` accepts a partial result list and writes
+   valid JSON with the expected ``m3_config_partial`` prefix.
+2. ``save_evaluation_results`` honors the separate
+   ``m3_config_no_gt_partial`` prefix used for ``--no-ground-truth`` runs.
+
+A pure-bash regression for ``eval.sh`` / ``compare.sh`` is too brittle to
+add to the standard regression suite because it requires the full eval
+toolchain (uv, python entrypoints) to run. The shell-side behavior is
+verified manually per the PR test plan.
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from benchmarks.helpers.sdk_eval_helpers import save_evaluation_results
+
+pytestmark = pytest.mark.regression
+
+
+def _sample_result(task_name: str, success: bool) -> dict:
+    return {
+        "task_name": task_name,
+        "uuid": task_name,
+        "difficulty": "easy",
+        "success": success,
+        "match_rate": 1.0 if success else 0.0,
+        "found_keywords": [],
+        "missing_keywords": [],
+    }
+
+
+def test_partial_results_saved_with_partial_prefix(tmp_path: Path) -> None:
+    """A non-empty partial result list lands in m3_config_partial_*.json."""
+    partial_results = [
+        _sample_result("hockey_395_0", success=True),
+        _sample_result("hockey_395_1", success=False),
+    ]
+
+    saved_path_str = save_evaluation_results(partial_results, tmp_path, prefix="m3_config_partial")
+    saved_path = Path(saved_path_str)
+
+    assert saved_path.exists(), f"expected partial result file at {saved_path}"
+    assert saved_path.name.startswith("m3_config_partial_"), (
+        f"partial saves must use the 'm3_config_partial' prefix so they're "
+        f"distinguishable from complete runs; got: {saved_path.name}"
+    )
+
+    # File must contain valid JSON with both task results intact.
+    loaded = json.loads(saved_path.read_text(encoding="utf-8"))
+    if isinstance(loaded, dict):
+        # Some helper variants nest results under a top-level key
+        results = loaded.get("results", loaded)
+    else:
+        results = loaded
+    assert isinstance(results, list), f"top-level shape should be list-like; got {type(results)}"
+    task_names = {r.get("task_name") for r in results}
+    assert task_names == {"hockey_395_0", "hockey_395_1"}
+
+
+def test_partial_save_with_no_ground_truth_prefix(tmp_path: Path) -> None:
+    """The --no-ground-truth branch uses a separate partial prefix."""
+    partial_results = [_sample_result("hockey_395_0", success=True)]
+
+    saved_path = Path(save_evaluation_results(partial_results, tmp_path, prefix="m3_config_no_gt_partial"))
+    assert saved_path.exists()
+    assert saved_path.name.startswith("m3_config_no_gt_partial_"), f"got: {saved_path.name}"
diff --git a/justfile b/justfile
index f5e20c6..52f4803 100644
--- a/justfile
+++ b/justfile
@@ -20,6 +20,10 @@ format:
 test-sanity:
     uv run pytest -m sanity
 
+# Live smoke: 1 AppWorld (SDK), 1 AppWorld (ReAct), 1 M3 hockey task; checks bundle report.md.
+test-smoke-e2e:
+    bash scripts/smoke_benchmarks.sh
+
 # Sanity + regression suite, run on every PR and push to master.
 test-regression:
     uv run pytest -m "sanity or regression"
diff --git a/pyproject.toml b/pyproject.toml
index c3d1b43..43679a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -91,10 +91,13 @@ markers = [
     "regression: integration tests, run on every PR",
     "stability: long-running tests, scheduled only",
 ]
-# --import-mode=importlib avoids module-name collisions between the four
-# per-benchmark `tests/__init__.py` packages (each named ``tests``), which
-# under the default ``prepend`` mode would shadow each other and fail to
-# collect with ``ModuleNotFoundError: No module named 'tests.test_X'``.
+# Use importlib import mode (`--import-mode=importlib`) so per-benchmark
+# `tests/` packages don't collide on the shared `tests.*` module namespace.
+# Without this, the top-level `tests/` dir and `benchmarks/*/tests/`
+# packages both try to register as the `tests` package and pytest fails
+# to collect with `ModuleNotFoundError: No module named 'tests.test_avg_steps'`
+# (introduced by PR #87 when per-benchmark test dirs were added to
+# testpaths). importlib mode imports each test file independently.
 addopts = "-ra --strict-markers --import-mode=importlib"
 asyncio_mode = "auto"
 
diff --git a/scripts/create_eval_bundle.py b/scripts/create_eval_bundle.py
index c1d0a98..4daa6f6 100644
--- a/scripts/create_eval_bundle.py
+++ b/scripts/create_eval_bundle.py
@@ -49,7 +49,7 @@ def _default_task_file(benchmark: str, result_file: Path) -> Path | None:
     data_dir = PROJECT_ROOT / "benchmarks" / benchmark / "data"
     if benchmark == "m3":
         if result_file.name.startswith("multiturn_"):
-            candidate = data_dir / "olympics_mutliturn.json"
+            candidate = data_dir / "olympics_multiturn.json"
         else:
             candidate = data_dir / "hockey.json"
         return candidate if candidate.exists() else None
diff --git a/scripts/smoke_benchmarks.sh b/scripts/smoke_benchmarks.sh
new file mode 100755
index 0000000..1ba4130
--- /dev/null
+++ b/scripts/smoke_benchmarks.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+# End-to-end smoke: one AppWorld (SDK), one AppWorld (ReAct), one M3 hockey task.
+# Validates bundle report.md metrics (tokens, steps, time, etc.; cost may be "--").
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT"
+
+APPWORLD_TASK="${SMOKE_APPWORLD_TASK:-82e2fac_1}"
+RUN_START_TS=$(date +%s)
+
+latest_bundle_report() {
+  local benchmark="$1"
+  local bundle_root="$ROOT/benchmarks/$benchmark/evaluation_bundles"
+  local newest="" newest_mtime=0 f mtime
+  while IFS= read -r -d '' f; do
+    # GNU form first: GNU `stat -f %m` prints filesystem info and breaks the integer tests.
+    mtime=$(stat -c %Y "$f" 2>/dev/null || stat -f %m "$f")
+    if [ "$mtime" -ge "$RUN_START_TS" ] && [ "$mtime" -gt "$newest_mtime" ]; then
+      newest_mtime=$mtime
+      newest=$f
+    fi
+  done < <(find "$bundle_root" -name report.md -type f -print0 2>/dev/null || true)
+  if [ -z "$newest" ]; then
+    echo "No report.md from this smoke run under $bundle_root" >&2
+    return 1
+  fi
+  echo "$newest"
+}
+
+free_port() {
+  local port="$1"
+  # Match TCP listeners only: a bare ":$port" also selects clients merely connected to that port.
+  command -v lsof >/dev/null 2>&1 || return 0
+  lsof -ti "tcp:$port" -sTCP:LISTEN >/dev/null 2>&1 || return 0
+  echo "Freeing port $port..."
+  lsof -ti "tcp:$port" -sTCP:LISTEN | xargs kill 2>/dev/null || true
+  for _ in 1 2 3 4 5; do
+    lsof -ti "tcp:$port" -sTCP:LISTEN >/dev/null 2>&1 || return 0
+    sleep 1
+  done
+  echo "Port $port still occupied; sending SIGKILL..."
+  lsof -ti "tcp:$port" -sTCP:LISTEN | xargs kill -9 2>/dev/null || true
+  for _ in 1 2 3 4 5; do
+    lsof -ti "tcp:$port" -sTCP:LISTEN >/dev/null 2>&1 || return 0
+    sleep 1
+  done
+  echo "Port $port still occupied after SIGKILL" >&2
+  return 1
+}
+
+run_and_check() {
+  local label="$1" benchmark="$2"
+  shift 2
+  echo ""
+  echo "========== $label =========="
+  RUN_START_TS=$(date +%s)  # per-step cutoff: an earlier step's report.md must not satisfy this step
+  "$@"
+  local report
+  report="$(latest_bundle_report "$benchmark")"
+  echo "Validating $report"
+  uv run python -m benchmarks.helpers.validate_bundle_report "$report"
+}
+
+echo "Smoke benchmarks (ROOT=$ROOT, RUN_START_TS=$RUN_START_TS)"
+
+run_and_check "AppWorld SDK (cuga)" appworld \
+  bash "$ROOT/benchmarks/appworld/eval.sh" --sdk --task "$APPWORLD_TASK"
+
+run_and_check "AppWorld ReAct" appworld \
+  bash "$ROOT/benchmarks/appworld/eval.sh" --agent react --task "$APPWORLD_TASK"
+
+free_port 8001
+run_and_check "M3 hockey (m3_task_2, max-samples 1)" m3 \
+  bash "$ROOT/benchmarks/m3/eval.sh" \
+  --m3-data "$ROOT/benchmarks/m3/data/small_train.zip" \
+  --capability m3_task_2 --domain hockey --max-samples 1
+
+echo ""
+echo "All smoke benchmark runs passed report validation."