From 44af3a9f2c4a5f74d7b3f2286d156f35b9421dff Mon Sep 17 00:00:00 2001
From: Chathurangi Shyalika
 <chathurangishyalika@Chathurangis-MacBook-Pro.local>
Date: Thu, 18 Jun 2026 16:41:34 -0400
Subject: [PATCH 1/4] feat(benchmark): add scenario suite runner

Signed-off-by: Chathurangi Shyalika <chathurangishyalika@Chathurangis-MacBook-Pro.local>
---
 benchmarks/scenario_suite/README.md           | 205 ++++++++++
 benchmarks/scenario_suite/scenarios.txt       |   2 +
 src/agent/direct_llm_agent/__init__.py        |   2 +-
 src/agent/direct_llm_agent/cli.py             |   2 +-
 src/agent/direct_llm_agent/runner.py          |  42 +-
 src/benchmark/__init__.py                     |   1 +
 src/benchmark/scenario_suite_runner.py        | 386 ++++++++++++++++++
 src/benchmark/tests/__init__.py               |   1 +
 .../tests/test_scenario_suite_runner.py       | 158 +++++++
 src/evaluation/cli.py                         |   3 +-
 src/evaluation/evaluator.py                   |   4 +-
 src/evaluation/loader.py                      |  24 +-
 src/evaluation/metrics.py                     |  10 +-
 src/evaluation/models.py                      |   1 +
 src/evaluation/report.py                      | 140 ++++++-
 src/evaluation/scorers/__init__.py            |   6 +-
 src/evaluation/scorers/llm_judge.py           |   4 +-
 src/evaluation/scorers/static_json.py         |  12 +-
 src/evaluation/tests/test_loader.py           |  14 +-
 src/evaluation/tests/test_metrics.py          |  47 +--
 src/evaluation/tests/test_models.py           |   4 +-
 src/evaluation/tests/test_report.py           |  65 ++-
 src/evaluation/tests/test_runner.py           |  12 +-
 .../tests/test_static_json_scorer.py          |   4 +-
 24 files changed, 1039 insertions(+), 110 deletions(-)
 create mode 100644 benchmarks/scenario_suite/README.md
 create mode 100644 benchmarks/scenario_suite/scenarios.txt
 create mode 100644 src/benchmark/__init__.py
 create mode 100644 src/benchmark/scenario_suite_runner.py
 create mode 100644 src/benchmark/tests/__init__.py
 create mode 100644 src/benchmark/tests/test_scenario_suite_runner.py

diff --git a/benchmarks/scenario_suite/README.md b/benchmarks/scenario_suite/README.md
new file mode 100644
index 00000000..603c8261
--- /dev/null
+++ b/benchmarks/scenario_suite/README.md
@@ -0,0 +1,205 @@
+# Benchmark Runner
+
+This folder contains the scenario list and usage notes for the benchmark.
+
+Add the IDs of the scenarios you want to execute to the scenario ID file described below.
+
+The benchmark runner executes each scenario sequentially, saves trajectories, and then invokes the existing evaluation pipeline to generate per-scenario and aggregate reports.
+
+## Scenario ID file
+
+The benchmark registry is a plain text file:
+
+```text
+benchmarks/scenario_suite/scenarios.txt
+```
+
+Each line contains one scenario id:
+
+```text
+11
+12
+14
+15
+```
+
+Blank lines and lines starting with `#` are ignored, so you can also use comments:
+
+```text
+# User 1
+11
+12
+14
+15
+
+# User 2
+21
+22
+23
+```
+
+## Expected scenario folder layout
+
+The runner expects a scenario root directory containing folders like:
+
+```text
+scenarios_data/
+  scenario_11/
+    question.txt
+    manifest.json
+    groundtruth.txt
+  scenario_12/
+    question.txt
+    manifest.json
+    groundtruth.txt
+```
+
+For each scenario:
+
+- `question.txt` is passed to the agent
+- `manifest.json` is used by CouchDB to load the data
+- `groundtruth.txt` is used by the evaluator
+
+The scenario folder name must match the id from `scenarios.txt`:
+
+- `11` → `scenario_11`
+- `12` → `scenario_12`
+
+## Run direct LLM
+
+Run the direct LLM baseline sequentially over the listed scenarios:
+
+```bash
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm --direct-model-id tokenrouter/MiniMax-M3
+```
+
+This writes trajectories to:
+
+```text
+traces/trajectories/scenario_suite/direct_llm/
+```
+
+and reports to:
+
+```text
+reports/scenario_suite/direct_llm/
+```
+
+## Run Stirrup agent
+
+Run the Stirrup agent sequentially over the listed scenarios:
+
+```bash
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name stirrup_agent
+```
+
+Run the Stirrup agent sequentially over the listed scenarios using the MiniMax model:
+
+```bash
+uv run python -m benchmark.scenario_suite_runner \
+  --scenario-ids benchmarks/scenario_suite/scenarios.txt \
+  --scenario-root /.../scenarios_data \
+  --agent_name stirrup_agent \
+  --stirrup-model-id tokenrouter/MiniMax-M3
+```
+
+This writes trajectories to:
+
+```text
+traces/trajectories/scenario_suite/stirrup_agent/
+```
+
+and reports to:
+
+```text
+reports/scenario_suite/stirrup_agent/
+```
+
+## Run all methods
+
+Run both supported methods one after the other:
+
+```bash
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name all
+```
+
+## Useful options
+
+### Dry run
+
+Print the commands without executing them:
+
+```bash
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --dry-run
+```
+
+### Skip existing trajectories
+
+Skip scenarios whose trajectory files already exist:
+
+```bash
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --skip-existing
+```
+
+### Continue after errors
+
+Keep running later scenarios even if one fails:
+
+```bash
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --continue-on-error
+```
+
+## Environment variables
+
+The direct LLM baseline uses TokenRouter by default. Set these before running:
+
+```bash
+export TOKENROUTER_API_KEY=your_tokenrouter_key
+export TOKENROUTER_BASE_URL=https://api.tokenrouter.com/v1
+```
+
+If you use a different model or backend, set the corresponding environment variables required by that backend.
+
+## Output layout
+
+Typical outputs look like this:
+
+```text
+traces/trajectories/scenario_suite/
+  direct_llm/
+    direct_llm_11.json
+    direct_llm_12.json
+    direct_llm_14.json
+    direct_llm_15.json
+  stirrup_agent/
+    stirrup_agent_11.json
+    stirrup_agent_12.json
+```
+
+```text
+reports/scenario_suite/
+  direct_llm/
+    direct_llm_11.json
+    direct_llm_12.json
+    _aggregate.json
+  stirrup_agent/
+    stirrup_agent_11.json
+    stirrup_agent_12.json
+    _aggregate.json
+```
+
+Each per-scenario report contains the final answer, score, and operational metrics. The aggregate report summarizes the full batch.
+
+## Tests
+
+Run the benchmark runner tests with:
+
+```bash
+uv run pytest src/benchmark/tests/test_scenario_suite_runner.py -v
+```
+
+Run all benchmark tests with:
+
+```bash
+uv run pytest src/benchmark/tests -v
+```
diff --git a/benchmarks/scenario_suite/scenarios.txt b/benchmarks/scenario_suite/scenarios.txt
new file mode 100644
index 00000000..a01f5504
--- /dev/null
+++ b/benchmarks/scenario_suite/scenarios.txt
@@ -0,0 +1,2 @@
+39
+38
\ No newline at end of file
diff --git a/src/agent/direct_llm_agent/__init__.py b/src/agent/direct_llm_agent/__init__.py
index 6f317b52..102c9726 100644
--- a/src/agent/direct_llm_agent/__init__.py
+++ b/src/agent/direct_llm_agent/__init__.py
@@ -2,4 +2,4 @@
 
 from .runner import DirectLLMAgentRunner
 
-__all__ = ["DirectLLMAgentRunner"]
+__all__ = ["DirectLLMAgentRunner"]
\ No newline at end of file
diff --git a/src/agent/direct_llm_agent/cli.py b/src/agent/direct_llm_agent/cli.py
index be8a7c66..b17328a3 100644
--- a/src/agent/direct_llm_agent/cli.py
+++ b/src/agent/direct_llm_agent/cli.py
@@ -75,4 +75,4 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/src/agent/direct_llm_agent/runner.py b/src/agent/direct_llm_agent/runner.py
index 6b4a35c4..544e3e93 100644
--- a/src/agent/direct_llm_agent/runner.py
+++ b/src/agent/direct_llm_agent/runner.py
@@ -7,6 +7,8 @@
 
 from __future__ import annotations
 
+import json
+import re
 import time
 from datetime import datetime, timezone
 from pathlib import Path
@@ -28,11 +30,43 @@
 data is missing. Use the task wording, general industrial-maintenance knowledge,
 and a reasonable guess if needed.
 
-Return only the final answer requested by the user. Do not include reasoning,
-tool-use claims, markdown, or extra explanation unless explicitly requested.
+Return only the final answer requested by the user.
+Do not include reasoning, chain-of-thought, <think> tags, tool-use claims,
+markdown, explanations, or any extra text.
+If the answer must be JSON, output only valid JSON.
 """
 
 
+def _clean_final_answer(text: str) -> str:
+    """Remove <think> blocks and keep only the final structured answer."""
+    if not text:
+        return ""
+
+    cleaned = text.strip()
+
+    # Remove <think>...</think> blocks if present.
+    cleaned = re.sub(
+        r"<think>.*?</think>",
+        "",
+        cleaned,
+        flags=re.IGNORECASE | re.DOTALL,
+    ).strip()
+
+    # Keep a trailing JSON object/array if the model added extra text.
+    json_match = re.search(r"(\{.*\}|\[.*\])\s*$", cleaned, flags=re.DOTALL)
+    if json_match:
+        cleaned = json_match.group(1).strip()
+
+    # Canonicalize JSON if possible so the saved answer is clean and strict.
+    try:
+        parsed = json.loads(cleaned)
+        cleaned = json.dumps(parsed, ensure_ascii=False, separators=(",", ":"))
+    except json.JSONDecodeError:
+        pass
+
+    return cleaned
+
+
 class DirectLLMAgentRunner(AgentRunner):
     """A simple model-only runner with no MCP tool calls."""
 
@@ -65,7 +99,7 @@ async def run(self, question: str) -> AgentResult:
             duration_ms = (time.perf_counter() - call_started) * 1000
             total_duration_ms = (time.perf_counter() - run_started) * 1000
 
-            answer = result.text.strip()
+            answer = _clean_final_answer(result.text)
 
             trajectory = Trajectory(
                 started_at=started,
@@ -100,4 +134,4 @@ async def run(self, question: str) -> AgentResult:
                 question=question,
                 answer=answer,
                 trajectory=trajectory,
-            )
+            )
\ No newline at end of file
diff --git a/src/benchmark/__init__.py b/src/benchmark/__init__.py
new file mode 100644
index 00000000..610df31f
--- /dev/null
+++ b/src/benchmark/__init__.py
@@ -0,0 +1 @@
+"""Benchmark runners for AssetOpsBench."""
\ No newline at end of file
diff --git a/src/benchmark/scenario_suite_runner.py b/src/benchmark/scenario_suite_runner.py
new file mode 100644
index 00000000..09845479
--- /dev/null
+++ b/src/benchmark/scenario_suite_runner.py
@@ -0,0 +1,386 @@
+"""Sequential runner for the benchmark scenarios.
+
+This runner reads a simple scenario-id file, runs each scenario with the
+selected agent method, saves trajectories through AGENT_TRAJECTORY_DIR, and
+optionally invokes the existing evaluator to generate reports.
+
+Example:
+
+    uv run python -m benchmark.scenario_suite_runner \
+      --scenario-ids benchmarks/scenario_suite/scenarios.txt \
+      --scenario-root /path/to/scenarios_data \
+      --agent_name direct_llm
+
+The scenario root is expected to contain folders such as:
+
+    scenarios_data/
+      scenario_11/
+        question.txt
+        groundtruth.txt
+      scenario_12/
+        question.txt
+        groundtruth.txt
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+
+_DEFAULT_DIRECT_LLM_MODEL = "tokenrouter/MiniMax-M3"
+_DEFAULT_STIRRUP_MODEL = "litellm_proxy/aws/claude-opus-4-8"
+
+
+@dataclass(frozen=True)
+class MethodConfig:
+    """Configuration for one benchmark method."""
+
+    agent_name: str
+    command: str
+    model_id: str
+
+
+def load_scenario_ids(path: Path) -> list[str]:
+    """Load scenario ids from a plain text file.
+
+    The file format is intentionally simple:
+
+        11
+        12
+        14
+
+    Blank lines and lines beginning with '#' are ignored.
+    """
+    if not path.exists():
+        raise FileNotFoundError(f"Scenario id file not found: {path}")
+
+    scenario_ids: list[str] = []
+    for raw_line in path.read_text(encoding="utf-8").splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#"):
+            continue
+        scenario_ids.append(line)
+
+    if not scenario_ids:
+        raise ValueError(f"No scenario ids found in {path}")
+
+    return scenario_ids
+
+
+def scenario_dir_for_id(scenario_root: Path, scenario_id: str) -> Path:
+    """Return the expected scenario folder path for a scenario id."""
+    return scenario_root / f"scenario_{scenario_id}"
+
+
+def read_question(scenario_root: Path, scenario_id: str) -> str:
+    """Read question.txt for a scenario."""
+    scenario_dir = scenario_dir_for_id(scenario_root, scenario_id)
+    question_path = scenario_dir / "question.txt"
+
+    if not question_path.exists():
+        raise FileNotFoundError(
+            f"Missing question file for scenario {scenario_id}: {question_path}"
+        )
+
+    question = question_path.read_text(encoding="utf-8").strip()
+    if not question:
+        raise ValueError(
+            f"Question file is empty for scenario {scenario_id}: {question_path}"
+        )
+
+    return question
+
+
+def validate_groundtruth_exists(scenario_root: Path, scenario_id: str) -> None:
+    """Warn if groundtruth.txt is missing.
+
+    The agent run itself only needs question.txt, but evaluation needs
+    groundtruth.txt.
+    """
+    scenario_dir = scenario_dir_for_id(scenario_root, scenario_id)
+    groundtruth_path = scenario_dir / "groundtruth.txt"
+
+    if not groundtruth_path.exists():
+        print(
+            f"warning: missing groundtruth for scenario {scenario_id}: {groundtruth_path}",
+            file=sys.stderr,
+        )
+
+
+def reset_and_load_couchdb(scenario_id: str, scenario_root: Path, dry_run: bool) -> None:
+    """Reset CouchDB and load the scenario-specific data from scenario_root."""
+    env = os.environ.copy()
+    # Children run with cwd=REPO_ROOT, so a relative root must be made absolute.
+    env["SCENARIOS_DATA_DIR"] = str(scenario_root.resolve())
+    reset_cmd = [sys.executable, "src/couchdb/init_data.py", "--reset-only"]
+    load_cmd = [sys.executable, "src/couchdb/init_data.py", scenario_id]
+
+    print("\n" + "-" * 80)
+    print(f"Preparing CouchDB for scenario {scenario_id}")
+    print("Reset command:")
+    print(" ".join(reset_cmd))
+    print("Load command:")
+    print(" ".join(load_cmd))
+    print("-" * 80)
+
+    if dry_run:
+        return
+
+    subprocess.run(reset_cmd, check=True, cwd=str(REPO_ROOT), env=env)
+    subprocess.run(load_cmd, check=True, cwd=str(REPO_ROOT), env=env)
+
+
+def run_agent_for_scenario(
+    *,
+    method: MethodConfig,
+    scenario_id: str,
+    question: str,
+    trajectory_dir: Path,
+    dry_run: bool,
+) -> None:
+    """Run one scenario with one method."""
+    run_id = f"{method.agent_name}_{scenario_id}"
+
+    env = os.environ.copy()
+    env["AGENT_TRAJECTORY_DIR"] = str(trajectory_dir)
+
+    cmd = [
+        "uv",
+        "run",
+        method.command,
+        "--model-id",
+        method.model_id,
+        "--scenario-id",
+        scenario_id,
+        "--run-id",
+        run_id,
+        question,
+    ]
+
+    print("\n" + "=" * 80)
+    print(f"Method:      {method.agent_name}")
+    print(f"Scenario ID: {scenario_id}")
+    print(f"Run ID:      {run_id}")
+    print(f"Trajectories: {trajectory_dir}")
+    print("Command:")
+    print(" ".join(cmd[:-1]) + " <question>")
+    print("=" * 80)
+
+    if dry_run:
+        return
+
+    subprocess.run(cmd, check=True, env=env)
+
+
+def run_evaluation(
+    *,
+    trajectory_dir: Path,
+    scenario_root: Path,
+    report_dir: Path,
+    dry_run: bool,
+) -> None:
+    """Run the existing AssetOpsBench evaluator for one method."""
+    cmd = [
+        "uv",
+        "run",
+        "evaluate",
+        "--trajectories",
+        str(trajectory_dir),
+        "--scenarios",
+        str(scenario_root),
+        "--scorer-default",
+        "static_json",
+        "--reports-dir",
+        str(report_dir),
+    ]
+
+    print("\n" + "=" * 80)
+    print("Running evaluation")
+    print(f"Trajectories: {trajectory_dir}")
+    print(f"Scenarios:    {scenario_root}")
+    print(f"Reports:      {report_dir}")
+    print("Command:")
+    print(" ".join(cmd))
+    print("=" * 80)
+
+    if dry_run:
+        return
+
+    subprocess.run(cmd, check=True)
+
+
+def build_methods(args: argparse.Namespace) -> dict[str, MethodConfig]:
+    """Build available method configs from CLI args."""
+    return {
+        "direct_llm": MethodConfig(
+            agent_name="direct_llm",
+            command="direct-llm-agent",
+            model_id=args.direct_model_id,
+        ),
+        "stirrup_agent": MethodConfig(
+            agent_name="stirrup_agent",
+            command="stirrup-agent",
+            model_id=args.stirrup_model_id,
+        ),
+    }
+
+
+def selected_methods(
+    *,
+    method_name: str,
+    methods: dict[str, MethodConfig],
+) -> list[MethodConfig]:
+    """Resolve the requested method selection."""
+    if method_name == "all":
+        return list(methods.values())
+
+    if method_name not in methods:
+        valid = ", ".join(sorted([*methods.keys(), "all"]))
+        raise ValueError(f"Unknown method '{method_name}'. Valid choices: {valid}")
+
+    return [methods[method_name]]
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="scenario_suite_runner",
+        description="Run benchmark scenarios sequentially.",
+    )
+
+    parser.add_argument(
+        "--scenario-ids",
+        type=Path,
+        default=Path("benchmarks/scenario_suite/scenarios.txt"),
+        help="Plain text file containing one scenario id per line.",
+    )
+    parser.add_argument(
+        "--scenario-root",
+        type=Path,
+        required=True,
+        help="Directory containing scenario_<id>/question.txt and groundtruth.txt folders.",
+    )
+    parser.add_argument(
+        "--agent_name",
+        choices=["direct_llm", "stirrup_agent", "all"],
+        default="direct_llm",
+        help="Which method to run.",
+    )
+    parser.add_argument(
+        "--trajectory-root",
+        type=Path,
+        default=Path("traces/trajectories/scenario_suite"),
+        help="Root directory for saved trajectories.",
+    )
+    parser.add_argument(
+        "--reports-root",
+        type=Path,
+        default=Path("reports/scenario_suite"),
+        help="Root directory for evaluation reports.",
+    )
+    parser.add_argument(
+        "--direct-model-id",
+        default=_DEFAULT_DIRECT_LLM_MODEL,
+        help="Model id for direct_llm.",
+    )
+    parser.add_argument(
+        "--stirrup-model-id",
+        default=_DEFAULT_STIRRUP_MODEL,
+        help="Model id for stirrup_agent.",
+    )
+    parser.add_argument(
+        "--skip-existing",
+        action="store_true",
+        help="Skip a scenario if its expected trajectory file already exists.",
+    )
+    parser.add_argument(
+        "--no-evaluate",
+        action="store_true",
+        help="Run agents only; do not invoke evaluator after the run.",
+    )
+    parser.add_argument(
+        "--continue-on-error",
+        action="store_true",
+        help="Continue running later scenarios if one scenario fails.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print commands without executing them.",
+    )
+
+    return parser
+
+
+def main() -> None:
+    parser = _build_parser()
+    args = parser.parse_args()
+
+    scenario_ids = load_scenario_ids(args.scenario_ids)
+    methods = selected_methods(
+        method_name=args.agent_name,
+        methods=build_methods(args),
+    )
+
+    print(f"Loaded {len(scenario_ids)} scenario ids from {args.scenario_ids}")
+    print(f"Selected methods: {', '.join(method.agent_name for method in methods)}")
+
+    for method in methods:
+        trajectory_dir = args.trajectory_root / method.agent_name
+        report_dir = args.reports_root / method.agent_name
+
+        if not args.dry_run:
+            trajectory_dir.mkdir(parents=True, exist_ok=True)
+            report_dir.mkdir(parents=True, exist_ok=True)
+
+        for scenario_id in scenario_ids:
+            expected_trajectory = trajectory_dir / f"{method.agent_name}_{scenario_id}.json"
+
+            if args.skip_existing and expected_trajectory.exists():
+                print(
+                    f"Skipping scenario {scenario_id}; trajectory exists: {expected_trajectory}"
+                )
+                continue
+
+            try:
+                validate_groundtruth_exists(args.scenario_root, scenario_id)
+                question = read_question(args.scenario_root, scenario_id)
+
+                # Uniform CouchDB preparation for every agent and every scenario.
+                reset_and_load_couchdb(
+                    scenario_id=scenario_id,
+                    scenario_root=args.scenario_root,
+                    dry_run=args.dry_run,
+                )
+
+                run_agent_for_scenario(
+                    method=method,
+                    scenario_id=scenario_id,
+                    question=question,
+                    trajectory_dir=trajectory_dir,
+                    dry_run=args.dry_run,
+                )
+            except Exception as exc:
+                print(
+                    f"error: scenario {scenario_id} failed for method {method.agent_name}: {exc}",
+                    file=sys.stderr,
+                )
+                if not args.continue_on_error:
+                    raise
+
+        if not args.no_evaluate:
+            run_evaluation(
+                trajectory_dir=trajectory_dir,
+                scenario_root=args.scenario_root,
+                report_dir=report_dir,
+                dry_run=args.dry_run,
+            )
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/benchmark/tests/__init__.py b/src/benchmark/tests/__init__.py
new file mode 100644
index 00000000..d25c211d
--- /dev/null
+++ b/src/benchmark/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for benchmark runners."""
\ No newline at end of file
diff --git a/src/benchmark/tests/test_scenario_suite_runner.py b/src/benchmark/tests/test_scenario_suite_runner.py
new file mode 100644
index 00000000..ae435a42
--- /dev/null
+++ b/src/benchmark/tests/test_scenario_suite_runner.py
@@ -0,0 +1,158 @@
+from __future__ import annotations
+
+from argparse import Namespace
+from pathlib import Path
+
+import pytest
+
+from benchmark import scenario_suite_runner as mr
+
+
+def test_load_scenario_ids_ignores_blank_lines_and_comments(tmp_path: Path) -> None:
+    p = tmp_path / "scenarios.txt"
+    p.write_text(
+        """
+        # scenario_suite scenarios
+
+        11
+        12
+
+        # more
+        14
+        15
+        """,
+        encoding="utf-8",
+    )
+
+    assert mr.load_scenario_ids(p) == ["11", "12", "14", "15"]
+
+
+def test_load_scenario_ids_raises_for_missing_file(tmp_path: Path) -> None:
+    p = tmp_path / "missing.txt"
+
+    with pytest.raises(FileNotFoundError):
+        mr.load_scenario_ids(p)
+
+
+def test_scenario_dir_for_id() -> None:
+    root = Path("/tmp/scenarios_data")
+    assert mr.scenario_dir_for_id(root, "11") == root / "scenario_11"
+
+
+def test_read_question_reads_question_txt(tmp_path: Path) -> None:
+    scenario_dir = tmp_path / "scenario_11"
+    scenario_dir.mkdir()
+    (scenario_dir / "question.txt").write_text("What is the count?", encoding="utf-8")
+
+    assert mr.read_question(tmp_path, "11") == "What is the count?"
+
+
+def test_read_question_raises_when_missing(tmp_path: Path) -> None:
+    (tmp_path / "scenario_11").mkdir()
+
+    with pytest.raises(FileNotFoundError):
+        mr.read_question(tmp_path, "11")
+
+
+def test_build_methods_uses_cli_defaults() -> None:
+    args = Namespace(
+        direct_model_id="tokenrouter/MiniMax-M3",
+        stirrup_model_id="tokenrouter/MiniMax-M3",
+    )
+
+    methods = mr.build_methods(args)
+
+    assert methods["direct_llm"].command == "direct-llm-agent"
+    assert methods["direct_llm"].model_id == "tokenrouter/MiniMax-M3"
+    assert methods["stirrup_agent"].command == "stirrup-agent"
+    assert methods["stirrup_agent"].model_id == "tokenrouter/MiniMax-M3"
+
+
+def test_selected_methods_direct_llm_only() -> None:
+    methods = {
+        "direct_llm": mr.MethodConfig(
+            agent_name="direct_llm",
+            command="direct-llm-agent",
+            model_id="tokenrouter/MiniMax-M3",
+        ),
+        "stirrup_agent": mr.MethodConfig(
+            agent_name="stirrup_agent",
+            command="stirrup-agent",
+            model_id="tokenrouter/MiniMax-M3",
+        ),
+    }
+
+    selected = mr.selected_methods(method_name="direct_llm", methods=methods)
+
+    assert len(selected) == 1
+    assert selected[0].agent_name == "direct_llm"
+
+
+def test_selected_methods_all_returns_both() -> None:
+    methods = {
+        "direct_llm": mr.MethodConfig(
+            agent_name="direct_llm",
+            command="direct-llm-agent",
+            model_id="tokenrouter/MiniMax-M3",
+        ),
+        "stirrup_agent": mr.MethodConfig(
+            agent_name="stirrup_agent",
+            command="stirrup-agent",
+            model_id="tokenrouter/MiniMax-M3",
+        ),
+    }
+
+    selected = mr.selected_methods(method_name="all", methods=methods)
+
+    assert [m.agent_name for m in selected] == ["direct_llm", "stirrup_agent"]
+
+
+def test_run_agent_for_scenario_dry_run_does_not_call_subprocess(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    called = False
+
+    def fake_run(*args, **kwargs):
+        nonlocal called
+        called = True
+        raise AssertionError("subprocess.run should not be called in dry_run")
+
+    monkeypatch.setattr(mr.subprocess, "run", fake_run)
+
+    method = mr.MethodConfig(
+        agent_name="direct_llm",
+        command="direct-llm-agent",
+        model_id="tokenrouter/MiniMax-M3",
+    )
+
+    mr.run_agent_for_scenario(
+        method=method,
+        scenario_id="11",
+        question="What is the count?",
+        trajectory_dir=tmp_path / "traj",
+        dry_run=True,
+    )
+
+    assert called is False
+
+
+def test_run_evaluation_dry_run_does_not_call_subprocess(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    called = False
+
+    def fake_run(*args, **kwargs):
+        nonlocal called
+        called = True
+        raise AssertionError("subprocess.run should not be called in dry_run")
+
+    monkeypatch.setattr(mr.subprocess, "run", fake_run)
+
+    mr.run_evaluation(
+        trajectory_dir=tmp_path / "traj",
+        scenario_root=tmp_path / "scenarios",
+        report_dir=tmp_path / "reports",
+        dry_run=True,
+    )
+
+    assert called is False
\ No newline at end of file
diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py
index 22661725..66ee508d 100644
--- a/src/evaluation/cli.py
+++ b/src/evaluation/cli.py
@@ -47,7 +47,8 @@ def _build_parser() -> argparse.ArgumentParser:
         "--scorer-default",
         dest="scorer_default",
         default="llm_judge",
-        help="Scorer name when scenario.scoring_method is unset. Default: llm_judge.",
+        help="Scorer name when scenario.scoring_method is unset. "
+        "Default: llm_judge.",
     )
     p.add_argument(
         "--judge-model",
diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py
index a9a1bed9..27845f08 100644
--- a/src/evaluation/evaluator.py
+++ b/src/evaluation/evaluator.py
@@ -82,9 +82,7 @@ def _score_one(
     def _resolve(name: str) -> Scorer:
         return scorer_registry.get(name)
 
-    def _validate_judge_model(
-        self, scorer_name: str, traj: PersistedTrajectory
-    ) -> None:
+    def _validate_judge_model(self, scorer_name: str, traj: PersistedTrajectory) -> None:
         if scorer_name != "llm_judge" or not self.judge_model:
             return
 
diff --git a/src/evaluation/loader.py b/src/evaluation/loader.py
index e1481f0f..1d5c0c9d 100644
--- a/src/evaluation/loader.py
+++ b/src/evaluation/loader.py
@@ -45,20 +45,20 @@ def _load_one_trajectory(path: Path) -> PersistedTrajectory:
 def load_scenarios(paths: Iterable[Path] | Path) -> list[Scenario]:
     """Load scenarios from one or more files or directories.
 
-      Supported inputs:
+    Supported inputs:
 
-      1. Existing JSON / JSONL scenario files.
-      2. A directory containing scenario subdirectories, each with
-         ``groundtruth.txt``. For example:
+    1. Existing JSON / JSONL scenario files.
+    2. A directory containing scenario subdirectories, each with
+       ``groundtruth.txt``. For example:
 
-         scenarios_data/
-    scenario_11/
-      groundtruth.txt
-    scenario_12/
-      groundtruth.txt
+       scenarios_data/
+         scenario_11/
+           groundtruth.txt
+         scenario_12/
+           groundtruth.txt
 
-      For folder-based scenarios, the folder name becomes the scenario id and
-      ``groundtruth.txt`` becomes ``expected_answer``.
+    For folder-based scenarios, the folder name becomes the scenario id and
+    ``groundtruth.txt`` becomes ``expected_answer``.
     """
     if isinstance(paths, (str, Path)):
         paths = [Path(paths)]
@@ -154,4 +154,4 @@ def join_records(
             continue
         scenario = by_id.get(traj.scenario_id)
         if scenario is not None:
-            yield scenario, traj
+            yield scenario, traj
\ No newline at end of file
diff --git a/src/evaluation/metrics.py b/src/evaluation/metrics.py
index 0255263c..325074a7 100644
--- a/src/evaluation/metrics.py
+++ b/src/evaluation/metrics.py
@@ -40,9 +40,7 @@ def _from_sdk_trajectory(traj: dict, model: str) -> OpsMetrics:
     tokens_in = sum(int(t.get("input_tokens") or 0) for t in turns)
     tokens_out = sum(int(t.get("output_tokens") or 0) for t in turns)
 
-    durations_ms = [
-        t.get("duration_ms") for t in turns if t.get("duration_ms") is not None
-    ]
+    durations_ms = [t.get("duration_ms") for t in turns if t.get("duration_ms") is not None]
     duration_ms = sum(durations_ms) if durations_ms else None
 
     tool_names: list[str] = []
@@ -67,7 +65,11 @@ def _from_plan_execute(steps: list[Any], model: str) -> OpsMetrics:
     # plan-execute persists ``list[StepResult]``; the dataclass exposes
     # ``server`` / ``tool`` / ``response`` fields but no per-step token
     # counts, so we surface what is available and leave the rest at zero.
-    tool_names = [s.get("tool") for s in steps if isinstance(s, dict) and s.get("tool")]
+    tool_names = [
+        s.get("tool")
+        for s in steps
+        if isinstance(s, dict) and s.get("tool")
+    ]
     return OpsMetrics(
         turn_count=len(steps),
         tool_call_count=len(tool_names),
diff --git a/src/evaluation/models.py b/src/evaluation/models.py
index 2f57e0b3..353619f2 100644
--- a/src/evaluation/models.py
+++ b/src/evaluation/models.py
@@ -116,3 +116,4 @@ class EvalReport(BaseModel):
     by_scenario_type: dict[str, TypeBreakdown] = Field(default_factory=dict)
     ops: AggregateOps = Field(default_factory=AggregateOps)
     results: list[ScenarioResult] = Field(default_factory=list)
+    score_summary: dict[str, Any] | None = None
diff --git a/src/evaluation/report.py b/src/evaluation/report.py
index 6ff5a9d2..638f2aa5 100644
--- a/src/evaluation/report.py
+++ b/src/evaluation/report.py
@@ -6,6 +6,7 @@
 import json
 from collections import defaultdict
 from pathlib import Path
+from typing import Any
 
 from .metrics import aggregate_ops
 from .models import EvalReport, ScenarioResult, TypeBreakdown
@@ -13,6 +14,95 @@
 _AGGREGATE_FILENAME = "_aggregate.json"
 
 
+def _safe_float(value: Any) -> float | None:
+    """Return ``value`` as a float if it is a non-bool int/float, else None."""
+    # bool is a subclass of int, so it has to be rejected explicitly.
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        return None
+    return float(value)
+
+
+def _avg(values: list[float]) -> float | None:  # mean rounded to 4 places; None if empty
+    return None if not values else round(sum(values) / len(values), 4)
+
+
+def _aggregate_score_summary(results: list[ScenarioResult]) -> dict[str, Any]:
+    """Summarize scorer output across a batch of scenario results.
+
+    Numeric entries found in each result's ``score.details`` are averaged per
+    metric name, the top-level ``score`` is reduced to avg/min/max, and the
+    lengths of the ``missing_keys`` / ``extra_keys`` / ``details`` lists are
+    summed. Per-scenario key-level details are left on the individual results.
+
+    Args:
+        results: Scored scenario results to aggregate.
+
+    Returns:
+        A flat dict with ``scored_results`` (number of results whose details
+        are a dict), ``score_avg`` / ``score_min`` / ``score_max``, one
+        ``<metric>_avg`` entry per tracked metric, and three ``*_total``
+        counters. Averages and extrema are ``None`` when no numeric value
+        was observed.
+    """
+    # Order matters: it fixes the order of the ``<metric>_avg`` output keys.
+    tracked_metrics = (
+        "partial_exact_match_accuracy",
+        "strict_exact_match_accuracy",
+        "partial_similarity_score",
+        "precision",
+        "recall",
+        "f1",
+        "total_gold_keys",
+        "total_model_keys",
+        "matched_keys",
+        "exact_value_matches",
+    )
+    # Detail entries that hold lists; only their lengths are accumulated.
+    list_totals = {
+        "missing_keys": "missing_keys_total",
+        "extra_keys": "extra_keys_total",
+        "details": "detail_entries_total",
+    }
+
+    top_scores: list[float] = []
+    samples: dict[str, list[float]] = {metric: [] for metric in tracked_metrics}
+    totals: dict[str, int] = dict.fromkeys(list_totals.values(), 0)
+    dict_detail_count = 0
+
+    for item in results:
+        # The top-level score is collected even when details are unusable.
+        top = _safe_float(item.score.score)
+        if top is not None:
+            top_scores.append(top)
+
+        info = item.score.details
+        if not isinstance(info, dict):
+            continue
+        dict_detail_count += 1
+
+        for metric in tracked_metrics:
+            number = _safe_float(info.get(metric))
+            if number is not None:
+                samples[metric].append(number)
+
+        for list_key, total_key in list_totals.items():
+            entries = info.get(list_key)
+            if isinstance(entries, list):
+                totals[total_key] += len(entries)
+
+    summary: dict[str, Any] = {
+        "scored_results": dict_detail_count,
+        "score_avg": _avg(top_scores),
+        "score_min": round(min(top_scores), 4) if top_scores else None,
+        "score_max": round(max(top_scores), 4) if top_scores else None,
+    }
+    # Emit the averages in tracked order, then the list-length totals.
+    for metric in tracked_metrics:
+        summary[f"{metric}_avg"] = _avg(samples[metric])
+    summary.update(totals)
+    return summary
+
+
 def build_report(results: list[ScenarioResult]) -> EvalReport:
     total = len(results)
     passed = sum(1 for r in results if r.score.passed)
@@ -43,6 +133,7 @@ def build_report(results: list[ScenarioResult]) -> EvalReport:
         },
         by_scenario_type=breakdown,
         ops=aggregate_ops(results),
+        score_summary=_aggregate_score_summary(results),
         results=results,
     )
 
@@ -58,7 +149,7 @@ def write_reports_dir(report: EvalReport, reports_dir: Path) -> Path:
     """Write one JSON file per result (``<run_id>.json``) plus an aggregate.
 
     Results without a ``run_id`` fall back to ``<scenario_id>.json`` so
-    nothing is dropped.  Returns the directory path.
+    nothing is dropped. Returns the directory path.
     """
     reports_dir = Path(reports_dir)
     reports_dir.mkdir(parents=True, exist_ok=True)
@@ -66,7 +157,6 @@ def write_reports_dir(report: EvalReport, reports_dir: Path) -> Path:
     used: dict[str, int] = {}
     for r in report.results:
         stem = r.run_id or f"scenario-{r.scenario_id}"
-        # Disambiguate any collisions deterministically.
         suffix = used.get(stem, 0)
         used[stem] = suffix + 1
         name = stem if suffix == 0 else f"{stem}-{suffix}"
@@ -88,6 +178,49 @@ def render_summary(report: EvalReport) -> str:
         f"Passed: {t.get('passed', 0)}  "
         f"Pass rate: {t.get('pass_rate', 0):.1%}"
     )
+
+    if report.score_summary:
+        s = report.score_summary
+        lines.append("")
+        lines.append("Static JSON summary:")
+        if s.get("score_avg") is not None:
+            lines.append(f"  score_avg:                  {s['score_avg']:.4f}")
+        if s.get("score_min") is not None:
+            lines.append(f"  score_min:                  {s['score_min']:.4f}")
+        if s.get("score_max") is not None:
+            lines.append(f"  score_max:                  {s['score_max']:.4f}")
+        if s.get("partial_exact_match_accuracy_avg") is not None:
+            lines.append(
+                f"  partial_exact_match_avg:     {s['partial_exact_match_accuracy_avg']:.4f}"
+            )
+        if s.get("strict_exact_match_accuracy_avg") is not None:
+            lines.append(
+                f"  strict_exact_match_avg:      {s['strict_exact_match_accuracy_avg']:.4f}"
+            )
+        if s.get("partial_similarity_score_avg") is not None:
+            lines.append(
+                f"  partial_similarity_avg:      {s['partial_similarity_score_avg']:.4f}"
+            )
+        if s.get("precision_avg") is not None:
+            lines.append(f"  precision_avg:               {s['precision_avg']:.4f}")
+        if s.get("recall_avg") is not None:
+            lines.append(f"  recall_avg:                  {s['recall_avg']:.4f}")
+        if s.get("f1_avg") is not None:
+            lines.append(f"  f1_avg:                      {s['f1_avg']:.4f}")
+        if s.get("total_gold_keys_avg") is not None:
+            lines.append(f"  total_gold_keys_avg:         {s['total_gold_keys_avg']:.4f}")
+        if s.get("total_model_keys_avg") is not None:
+            lines.append(f"  total_model_keys_avg:        {s['total_model_keys_avg']:.4f}")
+        if s.get("matched_keys_avg") is not None:
+            lines.append(f"  matched_keys_avg:            {s['matched_keys_avg']:.4f}")
+        if s.get("exact_value_matches_avg") is not None:
+            lines.append(
+                f"  exact_value_matches_avg:     {s['exact_value_matches_avg']:.4f}"
+            )
+        lines.append(f"  missing_keys_total:          {s.get('missing_keys_total', 0)}")
+        lines.append(f"  extra_keys_total:            {s.get('extra_keys_total', 0)}")
+        lines.append(f"  detail_entries_total:        {s.get('detail_entries_total', 0)}")
+
     if report.by_scenario_type:
         lines.append("")
         lines.append("By scenario type:")
@@ -95,6 +228,7 @@ def render_summary(report: EvalReport) -> str:
             lines.append(
                 f"  {stype:<16} {b.passed:>4}/{b.total:<4}  ({b.pass_rate:.1%})"
             )
+
     o = report.ops
     lines.append("")
     lines.append("Operational metrics:")
@@ -112,4 +246,4 @@ def render_summary(report: EvalReport) -> str:
 
 def report_to_json(report: EvalReport) -> str:
     """Convenience JSON dump that round-trips through pydantic."""
-    return json.dumps(json.loads(report.model_dump_json()), indent=2)
+    return json.dumps(json.loads(report.model_dump_json()), indent=2)
\ No newline at end of file
diff --git a/src/evaluation/scorers/__init__.py b/src/evaluation/scorers/__init__.py
index 37973fae..f681844a 100644
--- a/src/evaluation/scorers/__init__.py
+++ b/src/evaluation/scorers/__init__.py
@@ -30,7 +30,9 @@ def register(name: str, scorer: Scorer) -> None:
 
 def get(name: str) -> Scorer:
     if name not in _REGISTRY:
-        raise KeyError(f"unknown scorer {name!r}; registered: {sorted(_REGISTRY)}")
+        raise KeyError(
+            f"unknown scorer {name!r}; registered: {sorted(_REGISTRY)}"
+        )
     return _REGISTRY[name]
 
 
@@ -46,4 +48,4 @@ def names() -> list[str]:
 from . import semantic  # noqa: E402,F401
 from .static_json import install as _install_static_json  # noqa: E402
 
-_install_static_json()
+_install_static_json()
\ No newline at end of file
diff --git a/src/evaluation/scorers/llm_judge.py b/src/evaluation/scorers/llm_judge.py
index 139744dd..e37ecc21 100644
--- a/src/evaluation/scorers/llm_judge.py
+++ b/src/evaluation/scorers/llm_judge.py
@@ -140,7 +140,9 @@ def __call__(
         if review.get("hallucinations") is True:
             score = max(0.0, score - 0.2)
 
-        rationale = str(review.get("suggestions") or review.get("reason") or "")[:500]
+        rationale = str(
+            review.get("suggestions") or review.get("reason") or ""
+        )[:500]
         return ScorerResult(
             scorer=self.name,
             passed=passed,
diff --git a/src/evaluation/scorers/static_json.py b/src/evaluation/scorers/static_json.py
index 20fe7262..a26c53db 100644
--- a/src/evaluation/scorers/static_json.py
+++ b/src/evaluation/scorers/static_json.py
@@ -114,7 +114,9 @@ def _extract_balanced_structure(content: str) -> str:
         (content.find("("), "(", ")"),
     ]
     candidates = [
-        (idx, open_ch, close_ch) for idx, open_ch, close_ch in candidates if idx != -1
+        (idx, open_ch, close_ch)
+        for idx, open_ch, close_ch in candidates
+        if idx != -1
     ]
 
     if not candidates:
@@ -365,7 +367,9 @@ def evaluate_static_json(
     precision = exact_matches / total_model_keys if total_model_keys else 0.0
     recall = exact_matches / total_gold_keys if total_gold_keys else 0.0
     f1 = (
-        2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
+        2 * precision * recall / (precision + recall)
+        if precision + recall > 0
+        else 0.0
     )
 
     partial_exact = exact_matches / total_gold_keys if total_gold_keys else 0.0
@@ -388,7 +392,6 @@ def evaluate_static_json(
         details=details,
     )
 
-
 def evaluate_static_json_batch(
     pairs: list[tuple[Any, Any]],
     *,
@@ -436,7 +439,6 @@ def evaluate_static_json_batch(
         "examples": [score.to_dict() for score in scores],
     }
 
-
 class StaticJsonScorer:
     """Evaluation scorer wrapper for the trajectory-based pipeline."""
 
@@ -480,4 +482,4 @@ def __call__(
 
 def install(name: str = "static_json") -> None:
     """Register the static JSON scorer."""
-    register(name, StaticJsonScorer(name=name))
+    register(name, StaticJsonScorer(name=name))
\ No newline at end of file
diff --git a/src/evaluation/tests/test_loader.py b/src/evaluation/tests/test_loader.py
index 72b3b3e5..27d5c9a9 100644
--- a/src/evaluation/tests/test_loader.py
+++ b/src/evaluation/tests/test_loader.py
@@ -21,9 +21,7 @@ def test_load_trajectories_from_dir(trajectory_dir: Path):
 
 
 def test_load_trajectories_skips_unparseable(tmp_path: Path, make_persisted_record):
-    (tmp_path / "good.json").write_text(
-        json.dumps(make_persisted_record()), encoding="utf-8"
-    )
+    (tmp_path / "good.json").write_text(json.dumps(make_persisted_record()), encoding="utf-8")
     (tmp_path / "bad.json").write_text("{not json", encoding="utf-8")
     records = load_trajectories(tmp_path)
     assert len(records) == 1
@@ -32,7 +30,9 @@ def test_load_trajectories_skips_unparseable(tmp_path: Path, make_persisted_reco
 def test_load_scenarios_json_list(tmp_path: Path):
     p = tmp_path / "s.json"
     p.write_text(
-        json.dumps([{"id": 1, "text": "Q1"}, {"id": "2", "text": "Q2"}]),
+        json.dumps(
+            [{"id": 1, "text": "Q1"}, {"id": "2", "text": "Q2"}]
+        ),
         encoding="utf-8",
     )
     out = load_scenarios(p)
@@ -65,9 +65,7 @@ def test_join_drops_orphans(make_persisted_record):
     ]
     trajs = [
         PersistedTrajectory.from_raw(make_persisted_record(scenario_id=1)),
-        PersistedTrajectory.from_raw(
-            make_persisted_record(run_id="r2", scenario_id=99)
-        ),
+        PersistedTrajectory.from_raw(make_persisted_record(run_id="r2", scenario_id=99)),
     ]
     pairs = list(join_records(scenarios, trajs))
     assert len(pairs) == 1
@@ -110,4 +108,4 @@ def test_load_scenarios_from_groundtruth_folders(tmp_path):
     assert len(scenarios) == 1
     assert scenarios[0].id == "11"
     assert scenarios[0].expected_answer == "{'energy': 14, 'material': 48}"
-    assert scenarios[0].scoring_method == "static_json"
+    assert scenarios[0].scoring_method == "static_json"
\ No newline at end of file
diff --git a/src/evaluation/tests/test_metrics.py b/src/evaluation/tests/test_metrics.py
index df096d03..21f097b1 100644
--- a/src/evaluation/tests/test_metrics.py
+++ b/src/evaluation/tests/test_metrics.py
@@ -47,27 +47,9 @@ def test_plan_execute_list_trajectory(self, make_persisted_record):
         rec = PersistedTrajectory.from_raw(
             make_persisted_record(
                 trajectory=[
-                    {
-                        "step_number": 1,
-                        "task": "t",
-                        "server": "iot",
-                        "tool": "sites",
-                        "response": "ok",
-                    },
-                    {
-                        "step_number": 2,
-                        "task": "t2",
-                        "server": "iot",
-                        "tool": "assets",
-                        "response": "ok",
-                    },
-                    {
-                        "step_number": 3,
-                        "task": "t3",
-                        "server": "iot",
-                        "tool": "sites",
-                        "response": "ok",
-                    },
+                    {"step_number": 1, "task": "t", "server": "iot", "tool": "sites", "response": "ok"},
+                    {"step_number": 2, "task": "t2", "server": "iot", "tool": "assets", "response": "ok"},
+                    {"step_number": 3, "task": "t3", "server": "iot", "tool": "sites", "response": "ok"},
                 ]
             )
         )
@@ -85,21 +67,9 @@ def test_empty(self):
 
     def test_sums_and_percentiles(self):
         results = [
-            _result(
-                ops=OpsMetrics(
-                    tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1
-                )
-            ),
-            _result(
-                ops=OpsMetrics(
-                    tokens_in=20, tokens_out=10, duration_ms=300.0, tool_call_count=2
-                )
-            ),
-            _result(
-                ops=OpsMetrics(
-                    tokens_in=30, tokens_out=15, duration_ms=500.0, tool_call_count=3
-                )
-            ),
+            _result(ops=OpsMetrics(tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1)),
+            _result(ops=OpsMetrics(tokens_in=20, tokens_out=10, duration_ms=300.0, tool_call_count=2)),
+            _result(ops=OpsMetrics(tokens_in=30, tokens_out=15, duration_ms=500.0, tool_call_count=3)),
         ]
         agg = aggregate_ops(results)
         assert agg.tokens_in_total == 60
@@ -120,10 +90,7 @@ def test_cost_only_when_some_present(self):
 
 class TestNormalizeModel:
     def test_strips_provider_prefix(self):
-        assert (
-            _normalize_model("litellm_proxy/anthropic/claude-opus-4-5")
-            == "claude-opus-4-5"
-        )
+        assert _normalize_model("litellm_proxy/anthropic/claude-opus-4-5") == "claude-opus-4-5"
         assert _normalize_model("watsonx/ibm/granite-13b") == "granite-13b"
 
     def test_strips_long_numeric_suffix(self):
diff --git a/src/evaluation/tests/test_models.py b/src/evaluation/tests/test_models.py
index 621107a0..4aca4d55 100644
--- a/src/evaluation/tests/test_models.py
+++ b/src/evaluation/tests/test_models.py
@@ -10,9 +10,7 @@ def test_scenario_from_raw_coerces_int_id_to_str():
 
 
 def test_scenario_preserves_extra_fields():
-    s = Scenario.from_raw(
-        {"id": "1", "text": "Q", "characteristic_form": "X", "tolerance": 0.01}
-    )
+    s = Scenario.from_raw({"id": "1", "text": "Q", "characteristic_form": "X", "tolerance": 0.01})
     extra = s.model_extra or {}
     assert extra.get("tolerance") == 0.01
 
diff --git a/src/evaluation/tests/test_report.py b/src/evaluation/tests/test_report.py
index aabb5042..0c821f5c 100644
--- a/src/evaluation/tests/test_report.py
+++ b/src/evaluation/tests/test_report.py
@@ -27,9 +27,7 @@ def _result(stype: str, passed: bool, run_id: str = "", **ops_kwargs) -> Scenari
         model="watsonx/ibm/granite",
         question="q",
         answer="a",
-        score=ScorerResult(
-            scorer="llm_judge", passed=passed, score=1.0 if passed else 0.0
-        ),
+        score=ScorerResult(scorer="llm_judge", passed=passed, score=1.0 if passed else 0.0),
         ops=OpsMetrics(**ops_kwargs),
     )
 
@@ -100,17 +98,62 @@ def test_write_reports_dir_falls_back_to_scenario_id(tmp_path: Path):
 
 def test_render_summary_includes_headlines():
     results = [
-        _result(
-            "iot",
-            True,
-            tokens_in=10,
-            tokens_out=5,
-            duration_ms=100.0,
-            tool_call_count=1,
-        ),
+        _result("iot", True, tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1),
         _result("iot", False, tokens_in=8, tokens_out=4, duration_ms=200.0),
     ]
     text = render_summary(build_report(results))
     assert "Pass rate" in text
     assert "iot" in text
     assert "tokens_in_total" in text
+
+def test_build_report_includes_score_summary():
+    """build_report aggregates static_json score details into score_summary."""
+    from evaluation.models import OpsMetrics, ScenarioResult, ScorerResult
+
+    result = ScenarioResult(
+        scenario_id="11",
+        scenario_type="structured",
+        run_id="direct_llm_11",
+        runner="direct-llm-agent",
+        model="tokenrouter/MiniMax-M3",
+        question="Q",
+        answer='{"energy":0,"material":0}',
+        score=ScorerResult(
+            scorer="static_json",
+            passed=False,
+            score=0.0,
+            rationale="structured answer differs from ground truth",
+            details={
+                "partial_exact_match_accuracy": 0.0,
+                "strict_exact_match_accuracy": 0.0,
+                "partial_similarity_score": 0.0,
+                "precision": 0.0,
+                "recall": 0.0,
+                "f1": 0.0,
+                "total_gold_keys": 2,
+                "total_model_keys": 2,
+                "matched_keys": 2,
+                "exact_value_matches": 0,
+                "missing_keys": [],
+                "extra_keys": [],
+                "details": [],
+            },
+        ),
+        ops=OpsMetrics(
+            turn_count=1,
+            tool_call_count=0,
+            unique_tools=[],
+            tokens_in=390,
+            tokens_out=245,
+            duration_ms=6224.3382,
+            est_cost_usd=None,
+        ),
+    )
+
+    summary = build_report([result]).score_summary
+    assert summary is not None
+    assert summary["scored_results"] == 1
+    assert summary["partial_exact_match_accuracy_avg"] == 0.0
+    assert summary["strict_exact_match_accuracy_avg"] == 0.0
+    assert summary["total_gold_keys_avg"] == 2.0
+    assert summary["missing_keys_total"] == 0
\ No newline at end of file
diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py
index b82123f7..f8a936db 100644
--- a/src/evaluation/tests/test_runner.py
+++ b/src/evaluation/tests/test_runner.py
@@ -10,9 +10,7 @@
 from evaluation import scorers as registry
 
 
-def _always_pass_scorer(
-    scenario: Scenario, answer: str, trajectory_text: str
-) -> ScorerResult:
+def _always_pass_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult:
     return ScorerResult(scorer="stub", passed=True, score=1.0)
 
 
@@ -48,15 +46,11 @@ def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record):
     assert report.ops.tokens_in_total > 0
 
 
-def _always_fail_scorer(
-    scenario: Scenario, answer: str, trajectory_text: str
-) -> ScorerResult:
+def _always_fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult:
     return ScorerResult(scorer="stub-fail", passed=False, score=0.0)
 
 
-def test_evaluate_uses_per_scenario_scoring_method(
-    tmp_path: Path, make_persisted_record
-):
+def test_evaluate_uses_per_scenario_scoring_method(tmp_path: Path, make_persisted_record):
     rec = make_persisted_record(run_id="run-x", scenario_id=1, answer="A.")
     (tmp_path / "run-x.json").write_text(json.dumps(rec), encoding="utf-8")
 
diff --git a/src/evaluation/tests/test_static_json_scorer.py b/src/evaluation/tests/test_static_json_scorer.py
index 97ce7239..175a320c 100644
--- a/src/evaluation/tests/test_static_json_scorer.py
+++ b/src/evaluation/tests/test_static_json_scorer.py
@@ -5,7 +5,6 @@
     parse_structured_answer,
 )
 
-
 def test_parse_json_object_from_noisy_markdown_answer():
     raw = 'Answer:\n```json\n{"energy": 3, "material": 12}\n```'
 
@@ -123,6 +122,7 @@ def test_batch_evaluation():
     assert result["strict_exact_match_accuracy"] == 0.5
 
 
+
 from evaluation.models import Scenario
 from evaluation.scorers.static_json import StaticJsonScorer
 
@@ -147,4 +147,4 @@ def test_static_json_scorer_wrapper_exact_match():
     assert result.scorer == "static_json"
     assert result.passed is True
     assert result.score == 1.0
-    assert result.details["strict_exact_match_accuracy"] == 1.0
+    assert result.details["strict_exact_match_accuracy"] == 1.0
\ No newline at end of file

From 72d7ad075cdfa283ee107b39d6869aef98a0205a Mon Sep 17 00:00:00 2001
From: Chathurangi Shyalika
 <chathurangishyalika@Chathurangis-MacBook-Pro.local>
Date: Thu, 18 Jun 2026 16:51:21 -0400
Subject: [PATCH 2/4] Updating README.md

Signed-off-by: Chathurangi Shyalika <chathurangishyalika@Chathurangis-MacBook-Pro.local>
---
 benchmarks/scenario_suite/README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmarks/scenario_suite/README.md b/benchmarks/scenario_suite/README.md
index 603c8261..e6239f65 100644
--- a/benchmarks/scenario_suite/README.md
+++ b/benchmarks/scenario_suite/README.md
@@ -70,7 +70,7 @@ The scenario folder name must match the id from `scenarios.txt`:
 Run the direct LLM baseline sequentially over the listed scenarios:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm --direct-model-id tokenrouter/MiniMax-M3
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm --direct-model-id tokenrouter/MiniMax-M3
 ```
 
 This writes trajectories to:
@@ -99,7 +99,7 @@ Run the Stirrup agent sequentially over the listed scenarios using the MiniMax m
 uv run python -m benchmark.scenario_suite_runner \
   --scenario-ids benchmarks/scenario_suite/scenarios.txt \
   --scenario-root /.../scenarios_data \
-  --method stirrup_agent \
+  --agent_name stirrup_agent \
   --stirrup-model-id tokenrouter/MiniMax-M3
 ```
 
@@ -115,12 +115,12 @@ and reports to:
 reports/scenario_suite/stirrup_agent/
 ```
 
-## Run all methods
+## Run all agents
 
 Run both supported methods one after the other:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method all
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name all
 ```
 
 ## Useful options
@@ -130,7 +130,7 @@ uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/sce
 Print the commands without executing them:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm   --dry-run
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --dry-run
 ```
 
 ### Skip existing trajectories
@@ -138,7 +138,7 @@ uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/sce
 Skip scenarios whose trajectory files already exist:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm   --skip-existing
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --skip-existing
 ```
 
 ### Continue after errors
@@ -146,7 +146,7 @@ uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/sce
 Keep running later scenarios even if one fails:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm   --continue-on-error
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --continue-on-error
 ```
 
 ## Environment variables

From 055a3c1ffcc5856fc43c6a7a257a27e5251c94bf Mon Sep 17 00:00:00 2001
From: Chathurangi Shyalika
 <chathurangishyalika@Chathurangis-MacBook-Pro.local>
Date: Thu, 18 Jun 2026 19:25:26 -0400
Subject: [PATCH 3/4] Updating model parameters & Updating README.md

Signed-off-by: Chathurangi Shyalika <chathurangishyalika@Chathurangis-MacBook-Pro.local>
---
 benchmarks/scenario_suite/README.md    | 16 ++++++++--------
 src/benchmark/scenario_suite_runner.py | 23 +++++++++--------------
 2 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/benchmarks/scenario_suite/README.md b/benchmarks/scenario_suite/README.md
index e6239f65..ea21337f 100644
--- a/benchmarks/scenario_suite/README.md
+++ b/benchmarks/scenario_suite/README.md
@@ -70,7 +70,7 @@ The scenario folder name must match the id from `scenarios.txt`:
 Run the direct LLM baseline sequentially over the listed scenarios:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm --direct-model-id tokenrouter/MiniMax-M3
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm --model-id tokenrouter/MiniMax-M3
 ```
 
 This writes trajectories to:
@@ -99,8 +99,8 @@ Run the Stirrup agent sequentially over the listed scenarios using the MiniMax m
 uv run python -m benchmark.scenario_suite_runner \
   --scenario-ids benchmarks/scenario_suite/scenarios.txt \
   --scenario-root /.../scenarios_data \
-  --agent_name stirrup_agent \
-  --stirrup-model-id tokenrouter/MiniMax-M3
+  --method stirrup_agent \
+  --model-id tokenrouter/MiniMax-M3
 ```
 
 This writes trajectories to:
@@ -115,12 +115,12 @@ and reports to:
 reports/scenario_suite/stirrup_agent/
 ```
 
-## Run all agents
+## Run all methods
 
 Run both supported methods one after the other:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name all
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method all
 ```
 
 ## Useful options
@@ -130,7 +130,7 @@ uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/sce
 Print the commands without executing them:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --dry-run
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm   --dry-run
 ```
 
 ### Skip existing trajectories
@@ -138,7 +138,7 @@ uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/sce
 Skip scenarios whose trajectory files already exist:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --skip-existing
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm   --skip-existing
 ```
 
 ### Continue after errors
@@ -146,7 +146,7 @@ uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/sce
 Keep running later scenarios even if one fails:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --continue-on-error
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm   --continue-on-error
 ```
 
 ## Environment variables
diff --git a/src/benchmark/scenario_suite_runner.py b/src/benchmark/scenario_suite_runner.py
index 09845479..fe836698 100644
--- a/src/benchmark/scenario_suite_runner.py
+++ b/src/benchmark/scenario_suite_runner.py
@@ -9,7 +9,8 @@
     uv run python -m benchmark.scenario_suite_runner \
       --scenario-ids benchmarks/scenario_suite/scenarios.txt \
       --scenario-root /path/to/scenarios_data \
-      --method direct_llm
+      --agent_name direct_llm \
+      --model-id tokenrouter/MiniMax-M3
 
 The scenario root is expected to contain folders such as:
 
@@ -33,8 +34,7 @@
 
 REPO_ROOT = Path(__file__).resolve().parents[2]
 
-_DEFAULT_DIRECT_LLM_MODEL = "tokenrouter/MiniMax-M3"
-_DEFAULT_STIRRUP_MODEL = "litellm_proxy/aws/claude-opus-4-8"
+_DEFAULT_MODEL_ID = "tokenrouter/MiniMax-M3"
 
 
 @dataclass(frozen=True)
@@ -221,12 +221,12 @@ def build_methods(args: argparse.Namespace) -> dict[str, MethodConfig]:
         "direct_llm": MethodConfig(
             agent_name="direct_llm",
             command="direct-llm-agent",
-            model_id=args.direct_model_id,
+            model_id=args.model_id,
         ),
         "stirrup_agent": MethodConfig(
             agent_name="stirrup_agent",
             command="stirrup-agent",
-            model_id=args.stirrup_model_id,
+            model_id=args.model_id,
         ),
     }
 
@@ -269,7 +269,7 @@ def _build_parser() -> argparse.ArgumentParser:
         "--agent_name",
         choices=["direct_llm", "stirrup_agent", "all"],
         default="direct_llm",
-        help="Which method to run.",
+        help="Which agent to run.",
     )
     parser.add_argument(
         "--trajectory-root",
@@ -284,14 +284,9 @@ def _build_parser() -> argparse.ArgumentParser:
         help="Root directory for evaluation reports.",
     )
     parser.add_argument(
-        "--direct-model-id",
-        default=_DEFAULT_DIRECT_LLM_MODEL,
-        help="Model id for direct_llm.",
-    )
-    parser.add_argument(
-        "--stirrup-model-id",
-        default=_DEFAULT_STIRRUP_MODEL,
-        help="Model id for stirrup_agent.",
+        "--model-id",
+        default=_DEFAULT_MODEL_ID,
+        help="Model id used by both agents.",
     )
     parser.add_argument(
         "--skip-existing",

From afb3437ffe902f5e3a42d7692959d4179cef8b9a Mon Sep 17 00:00:00 2001
From: Chathurangi Shyalika
 <chathurangishyalika@Chathurangis-MacBook-Pro.local>
Date: Thu, 18 Jun 2026 19:29:55 -0400
Subject: [PATCH 4/4] docs(benchmark): rename --method to --agent_name in README

Signed-off-by: Chathurangi Shyalika <chathurangishyalika@Chathurangis-MacBook-Pro.local>
---
 benchmarks/scenario_suite/README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/benchmarks/scenario_suite/README.md b/benchmarks/scenario_suite/README.md
index ea21337f..2fd08335 100644
--- a/benchmarks/scenario_suite/README.md
+++ b/benchmarks/scenario_suite/README.md
@@ -70,7 +70,7 @@ The scenario folder name must match the id from `scenarios.txt`:
 Run the direct LLM baseline sequentially over the listed scenarios:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm --model-id tokenrouter/MiniMax-M3
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm --model-id tokenrouter/MiniMax-M3
 ```
 
 This writes trajectories to:
@@ -99,7 +99,7 @@ Run the Stirrup agent sequentially over the listed scenarios using the MiniMax m
 uv run python -m benchmark.scenario_suite_runner \
   --scenario-ids benchmarks/scenario_suite/scenarios.txt \
   --scenario-root /.../scenarios_data \
-  --method stirrup_agent \
+  --agent_name stirrup_agent \
   --model-id tokenrouter/MiniMax-M3
 ```
 
@@ -115,12 +115,12 @@ and reports to:
 reports/scenario_suite/stirrup_agent/
 ```
 
-## Run all methods
+## Run all agents
 
-Run both supported methods one after the other:
+Run all supported agents one after the other:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method all
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name all
 ```
 
 ## Useful options
@@ -130,7 +130,7 @@ uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/sce
 Print the commands without executing them:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm   --dry-run
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --dry-run
 ```
 
 ### Skip existing trajectories
@@ -138,7 +138,7 @@ uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/sce
 Skip scenarios whose trajectory files already exist:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm   --skip-existing
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --skip-existing
 ```
 
 ### Continue after errors
@@ -146,7 +146,7 @@ uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/sce
 Keep running later scenarios even if one fails:
 
 ```bash
-uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --method direct_llm   --continue-on-error
+uv run python -m benchmark.scenario_suite_runner   --scenario-ids benchmarks/scenario_suite/scenarios.txt   --scenario-root /.../scenarios_data   --agent_name direct_llm   --continue-on-error
 ```
 
 ## Environment variables