From 44af3a9f2c4a5f74d7b3f2286d156f35b9421dff Mon Sep 17 00:00:00 2001 From: Chathurangi Shyalika Date: Thu, 18 Jun 2026 16:41:34 -0400 Subject: [PATCH 1/4] feat(benchmark): add scenario suite runner Signed-off-by: Chathurangi Shyalika --- benchmarks/scenario_suite/README.md | 205 ++++++++++ benchmarks/scenario_suite/scenarios.txt | 2 + src/agent/direct_llm_agent/__init__.py | 2 +- src/agent/direct_llm_agent/cli.py | 2 +- src/agent/direct_llm_agent/runner.py | 42 +- src/benchmark/__init__.py | 1 + src/benchmark/scenario_suite_runner.py | 386 ++++++++++++++++++ src/benchmark/tests/__init__.py | 1 + .../tests/test_scenario_suite_runner.py | 158 +++++++ src/evaluation/cli.py | 3 +- src/evaluation/evaluator.py | 4 +- src/evaluation/loader.py | 24 +- src/evaluation/metrics.py | 10 +- src/evaluation/models.py | 1 + src/evaluation/report.py | 140 ++++++- src/evaluation/scorers/__init__.py | 6 +- src/evaluation/scorers/llm_judge.py | 4 +- src/evaluation/scorers/static_json.py | 12 +- src/evaluation/tests/test_loader.py | 14 +- src/evaluation/tests/test_metrics.py | 47 +-- src/evaluation/tests/test_models.py | 4 +- src/evaluation/tests/test_report.py | 65 ++- src/evaluation/tests/test_runner.py | 12 +- .../tests/test_static_json_scorer.py | 4 +- 24 files changed, 1039 insertions(+), 110 deletions(-) create mode 100644 benchmarks/scenario_suite/README.md create mode 100644 benchmarks/scenario_suite/scenarios.txt create mode 100644 src/benchmark/__init__.py create mode 100644 src/benchmark/scenario_suite_runner.py create mode 100644 src/benchmark/tests/__init__.py create mode 100644 src/benchmark/tests/test_scenario_suite_runner.py diff --git a/benchmarks/scenario_suite/README.md b/benchmarks/scenario_suite/README.md new file mode 100644 index 00000000..603c8261 --- /dev/null +++ b/benchmarks/scenario_suite/README.md @@ -0,0 +1,205 @@ +# Benchmark Runner + +This folder contains the scenario list and usage notes for the benchmark. 
+ +List the scenario IDs you want to execute in `scenarios.txt`, one per line. + +The benchmark runner executes each scenario sequentially, saves trajectories, and then invokes the existing evaluation pipeline to generate per-scenario and aggregate reports. + +## Scenario ID file + +The benchmark registry is a plain text file: + +```text +benchmarks/scenario_suite/scenarios.txt +``` + +Each line contains one scenario id: + +```text +11 +12 +14 +15 +``` + +Blank lines and lines starting with `#` are ignored, so you can also use comments: + +```text +# User 1 +11 +12 +14 +15 + +# User 2 +21 +22 +23 +``` + +## Expected scenario folder layout + +The runner expects a scenario root directory containing folders like: + +```text +scenarios_data/ + scenario_11/ + question.txt + manifest.json + groundtruth.txt + scenario_12/ + question.txt + manifest.json + groundtruth.txt +``` + +For each scenario: + +- `question.txt` is passed to the agent +- `manifest.json` is used by CouchDB to load the data +- `groundtruth.txt` is used by the evaluator + +The scenario folder name must match the id from `scenarios.txt`: + +- `11` → `scenario_11` +- `12` → `scenario_12` + +## Run direct LLM + +Run the direct LLM baseline sequentially over the listed scenarios: + +```bash +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --direct-model-id tokenrouter/MiniMax-M3 +``` + +This writes trajectories to: + +```text +traces/trajectories/scenario_suite/direct_llm/ +``` + +and reports to: + +```text +reports/scenario_suite/direct_llm/ +``` + +## Run Stirrup agent + +Run the Stirrup agent sequentially over the listed scenarios: + +```bash +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name stirrup_agent +``` + +Run the Stirrup agent sequentially over the listed scenarios using the MiniMax model: + 
+```bash +uv run python -m benchmark.scenario_suite_runner \ + --scenario-ids benchmarks/scenario_suite/scenarios.txt \ + --scenario-root /.../scenarios_data \ + --method stirrup_agent \ + --stirrup-model-id tokenrouter/MiniMax-M3 +``` + +This writes trajectories to: + +```text +traces/trajectories/scenario_suite/stirrup_agent/ +``` + +and reports to: + +```text +reports/scenario_suite/stirrup_agent/ +``` + +## Run all methods + +Run both supported methods one after the other: + +```bash +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method all +``` + +## Useful options + +### Dry run + +Print the commands without executing them: + +```bash +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --dry-run +``` + +### Skip existing trajectories + +Skip scenarios whose trajectory files already exist: + +```bash +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --skip-existing +``` + +### Continue after errors + +Keep running later scenarios even if one fails: + +```bash +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --continue-on-error +``` + +## Environment variables + +The direct LLM baseline uses TokenRouter by default. Set these before running: + +```bash +export TOKENROUTER_API_KEY=your_tokenrouter_key +export TOKENROUTER_BASE_URL=https://api.tokenrouter.com/v1 +``` + +If you use a different model or backend, set the corresponding environment variables required by that backend. 
+ +## Output layout + +Typical outputs look like this: + +```text +traces/trajectories/scenario_suite/ + direct_llm/ + direct_llm_11.json + direct_llm_12.json + direct_llm_14.json + direct_llm_15.json + stirrup_agent/ + stirrup_agent_11.json + stirrup_agent_12.json +``` + +```text +reports/scenario_suite/ + direct_llm/ + direct_llm_11.json + direct_llm_12.json + _aggregate.json + stirrup_agent/ + stirrup_agent_11.json + stirrup_agent_12.json + _aggregate.json +``` + +Each per-scenario report contains the final answer, score, and operational metrics. The aggregate report summarizes the full batch. + +## Tests + +Run the benchmark runner tests with: + +```bash +uv run pytest src/benchmark/tests/test_scenario_suite_runner.py -v +``` + +Run all benchmark tests with: + +```bash +uv run pytest src/benchmark/tests -v +``` diff --git a/benchmarks/scenario_suite/scenarios.txt b/benchmarks/scenario_suite/scenarios.txt new file mode 100644 index 00000000..a01f5504 --- /dev/null +++ b/benchmarks/scenario_suite/scenarios.txt @@ -0,0 +1,2 @@ +39 +38 \ No newline at end of file diff --git a/src/agent/direct_llm_agent/__init__.py b/src/agent/direct_llm_agent/__init__.py index 6f317b52..102c9726 100644 --- a/src/agent/direct_llm_agent/__init__.py +++ b/src/agent/direct_llm_agent/__init__.py @@ -2,4 +2,4 @@ from .runner import DirectLLMAgentRunner -__all__ = ["DirectLLMAgentRunner"] +__all__ = ["DirectLLMAgentRunner"] \ No newline at end of file diff --git a/src/agent/direct_llm_agent/cli.py b/src/agent/direct_llm_agent/cli.py index be8a7c66..b17328a3 100644 --- a/src/agent/direct_llm_agent/cli.py +++ b/src/agent/direct_llm_agent/cli.py @@ -75,4 +75,4 @@ def main() -> None: if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/src/agent/direct_llm_agent/runner.py b/src/agent/direct_llm_agent/runner.py index 6b4a35c4..544e3e93 100644 --- a/src/agent/direct_llm_agent/runner.py +++ b/src/agent/direct_llm_agent/runner.py @@ -7,6 +7,8 @@ from __future__ 
import annotations +import json +import re import time from datetime import datetime, timezone from pathlib import Path @@ -28,11 +30,43 @@ data is missing. Use the task wording, general industrial-maintenance knowledge, and a reasonable guess if needed. -Return only the final answer requested by the user. Do not include reasoning, -tool-use claims, markdown, or extra explanation unless explicitly requested. +Return only the final answer requested by the user. +Do not include reasoning, chain-of-thought, tags, tool-use claims, +markdown, explanations, or any extra text. +If the answer must be JSON, output only valid JSON. """ +def _clean_final_answer(text: str) -> str: + """Strip <think>...</think> reasoning blocks and keep only the final structured answer.""" + if not text: + return "" + + cleaned = text.strip() + + # Remove <think>...</think> blocks if present. The pattern must name the + # tags explicitly: a bare ".*?" only matches empty strings and removes nothing. + cleaned = re.sub( + r"<think>.*?</think>", + "", + cleaned, + flags=re.IGNORECASE | re.DOTALL, + ).strip() + + # Keep a trailing JSON object/array if the model added extra text. + json_match = re.search(r"(\{.*\}|\[.*\])\s*$", cleaned, flags=re.DOTALL) + if json_match: + cleaned = json_match.group(1).strip() + + # Canonicalize JSON if possible so the saved answer is clean and strict. 
+ try: + parsed = json.loads(cleaned) + cleaned = json.dumps(parsed, ensure_ascii=False, separators=(",", ":")) + except json.JSONDecodeError: + pass + + return cleaned + + class DirectLLMAgentRunner(AgentRunner): """A simple model-only runner with no MCP tool calls.""" @@ -65,7 +99,7 @@ async def run(self, question: str) -> AgentResult: duration_ms = (time.perf_counter() - call_started) * 1000 total_duration_ms = (time.perf_counter() - run_started) * 1000 - answer = result.text.strip() + answer = _clean_final_answer(result.text) trajectory = Trajectory( started_at=started, @@ -100,4 +134,4 @@ async def run(self, question: str) -> AgentResult: question=question, answer=answer, trajectory=trajectory, - ) + ) \ No newline at end of file diff --git a/src/benchmark/__init__.py b/src/benchmark/__init__.py new file mode 100644 index 00000000..610df31f --- /dev/null +++ b/src/benchmark/__init__.py @@ -0,0 +1 @@ +"""Benchmark runners for AssetOpsBench.""" \ No newline at end of file diff --git a/src/benchmark/scenario_suite_runner.py b/src/benchmark/scenario_suite_runner.py new file mode 100644 index 00000000..09845479 --- /dev/null +++ b/src/benchmark/scenario_suite_runner.py @@ -0,0 +1,386 @@ +"""Sequential runner for the benchmark scenarios. + +This runner reads a simple scenario-id file, runs each scenario with the +selected agent method, saves trajectories through AGENT_TRAJECTORY_DIR, and +optionally invokes the existing evaluator to generate reports. 
+ +Example: + + uv run python -m benchmark.scenario_suite_runner \ + --scenario-ids benchmarks/scenario_suite/scenarios.txt \ + --scenario-root /path/to/scenarios_data \ + --method direct_llm + +The scenario root is expected to contain folders such as: + + scenarios_data/ + scenario_11/ + question.txt + groundtruth.txt + scenario_12/ + question.txt + groundtruth.txt +""" + +from __future__ import annotations + +import argparse +import os +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] + +_DEFAULT_DIRECT_LLM_MODEL = "tokenrouter/MiniMax-M3" +_DEFAULT_STIRRUP_MODEL = "litellm_proxy/aws/claude-opus-4-8" + + +@dataclass(frozen=True) +class MethodConfig: + """Configuration for one benchmark method.""" + + agent_name: str + command: str + model_id: str + + +def load_scenario_ids(path: Path) -> list[str]: + """Load scenario ids from a plain text file. + + The file format is intentionally simple: + + 11 + 12 + 14 + + Blank lines and lines beginning with '#' are ignored. 
+ """ + if not path.exists(): + raise FileNotFoundError(f"Scenario id file not found: {path}") + + scenario_ids: list[str] = [] + for raw_line in path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line or line.startswith("#"): + continue + scenario_ids.append(line) + + if not scenario_ids: + raise ValueError(f"No scenario ids found in {path}") + + return scenario_ids + + +def scenario_dir_for_id(scenario_root: Path, scenario_id: str) -> Path: + """Return the expected scenario folder path for a scenario id.""" + return scenario_root / f"scenario_{scenario_id}" + + +def read_question(scenario_root: Path, scenario_id: str) -> str: + """Read question.txt for a scenario.""" + scenario_dir = scenario_dir_for_id(scenario_root, scenario_id) + question_path = scenario_dir / "question.txt" + + if not question_path.exists(): + raise FileNotFoundError( + f"Missing question file for scenario {scenario_id}: {question_path}" + ) + + question = question_path.read_text(encoding="utf-8").strip() + if not question: + raise ValueError( + f"Question file is empty for scenario {scenario_id}: {question_path}" + ) + + return question + + +def validate_groundtruth_exists(scenario_root: Path, scenario_id: str) -> None: + """Warn if groundtruth.txt is missing. + + The agent run itself only needs question.txt, but evaluation needs + groundtruth.txt. 
+ """ + scenario_dir = scenario_dir_for_id(scenario_root, scenario_id) + groundtruth_path = scenario_dir / "groundtruth.txt" + + if not groundtruth_path.exists(): + print( + f"warning: missing groundtruth for scenario {scenario_id}: {groundtruth_path}", + file=sys.stderr, + ) + + +def reset_and_load_couchdb(scenario_id: str, scenario_root: Path, dry_run: bool) -> None: + """Reset CouchDB and load the scenario-specific data from scenario_root.""" + env = os.environ.copy() + env["SCENARIOS_DATA_DIR"] = str(scenario_root) + + reset_cmd = [sys.executable, "src/couchdb/init_data.py", "--reset-only"] + load_cmd = [sys.executable, "src/couchdb/init_data.py", scenario_id] + + print("\n" + "-" * 80) + print(f"Preparing CouchDB for scenario {scenario_id}") + print("Reset command:") + print(" ".join(reset_cmd)) + print("Load command:") + print(" ".join(load_cmd)) + print("-" * 80) + + if dry_run: + return + + subprocess.run(reset_cmd, check=True, cwd=str(REPO_ROOT), env=env) + subprocess.run(load_cmd, check=True, cwd=str(REPO_ROOT), env=env) + + +def run_agent_for_scenario( + *, + method: MethodConfig, + scenario_id: str, + question: str, + trajectory_dir: Path, + dry_run: bool, +) -> None: + """Run one scenario with one method.""" + run_id = f"{method.agent_name}_{scenario_id}" + + env = os.environ.copy() + env["AGENT_TRAJECTORY_DIR"] = str(trajectory_dir) + + cmd = [ + "uv", + "run", + method.command, + "--model-id", + method.model_id, + "--scenario-id", + scenario_id, + "--run-id", + run_id, + question, + ] + + print("\n" + "=" * 80) + print(f"Method: {method.agent_name}") + print(f"Scenario ID: {scenario_id}") + print(f"Run ID: {run_id}") + print(f"Trajectories: {trajectory_dir}") + print("Command:") + print(" ".join(cmd[:-1]) + " ") + print("=" * 80) + + if dry_run: + return + + subprocess.run(cmd, check=True, env=env) + + +def run_evaluation( + *, + trajectory_dir: Path, + scenario_root: Path, + report_dir: Path, + dry_run: bool, +) -> None: + """Run the existing 
AssetOpsBench evaluator for one method.""" + cmd = [ + "uv", + "run", + "evaluate", + "--trajectories", + str(trajectory_dir), + "--scenarios", + str(scenario_root), + "--scorer-default", + "static_json", + "--reports-dir", + str(report_dir), + ] + + print("\n" + "=" * 80) + print("Running evaluation") + print(f"Trajectories: {trajectory_dir}") + print(f"Scenarios: {scenario_root}") + print(f"Reports: {report_dir}") + print("Command:") + print(" ".join(cmd)) + print("=" * 80) + + if dry_run: + return + + subprocess.run(cmd, check=True) + + +def build_methods(args: argparse.Namespace) -> dict[str, MethodConfig]: + """Build available method configs from CLI args.""" + return { + "direct_llm": MethodConfig( + agent_name="direct_llm", + command="direct-llm-agent", + model_id=args.direct_model_id, + ), + "stirrup_agent": MethodConfig( + agent_name="stirrup_agent", + command="stirrup-agent", + model_id=args.stirrup_model_id, + ), + } + + +def selected_methods( + *, + method_name: str, + methods: dict[str, MethodConfig], +) -> list[MethodConfig]: + """Resolve the requested method selection.""" + if method_name == "all": + return list(methods.values()) + + if method_name not in methods: + valid = ", ".join(sorted([*methods.keys(), "all"])) + raise ValueError(f"Unknown method '{method_name}'. 
Valid choices: {valid}") + + return [methods[method_name]] + + +def _build_parser() -> argparse.ArgumentParser: + """Build the CLI parser; ``--method`` is the documented flag and ``--agent_name`` is kept as an alias.""" + parser = argparse.ArgumentParser( + prog="scenario_suite_runner", + description="Run benchmark scenarios sequentially.", + ) + + parser.add_argument( + "--scenario-ids", + type=Path, + default=Path("benchmarks/scenario_suite/scenarios.txt"), + help="Plain text file containing one scenario id per line.", + ) + parser.add_argument( + "--scenario-root", + type=Path, + required=True, + help="Directory containing scenario_<id>/ folders with question.txt and groundtruth.txt.", + ) + # The README and module docstring invoke this option as --method; the value + # is still stored on args.agent_name so main() and existing callers keep working. + parser.add_argument( + "--method", + "--agent_name", + dest="agent_name", + choices=["direct_llm", "stirrup_agent", "all"], + default="direct_llm", + help="Which method to run.", + ) + parser.add_argument( + "--trajectory-root", + type=Path, + default=Path("traces/trajectories/scenario_suite"), + help="Root directory for saved trajectories.", + ) + parser.add_argument( + "--reports-root", + type=Path, + default=Path("reports/scenario_suite"), + help="Root directory for evaluation reports.", + ) + parser.add_argument( + "--direct-model-id", + default=_DEFAULT_DIRECT_LLM_MODEL, + help="Model id for direct_llm.", + ) + parser.add_argument( + "--stirrup-model-id", + default=_DEFAULT_STIRRUP_MODEL, + help="Model id for stirrup_agent.", + ) + parser.add_argument( + "--skip-existing", + action="store_true", + help="Skip a scenario if its expected trajectory file already exists.", + ) + parser.add_argument( + "--no-evaluate", + action="store_true", + help="Run agents only; do not invoke evaluator after the run.", + ) + parser.add_argument( + "--continue-on-error", + action="store_true", + help="Continue running later scenarios if one 
scenario fails.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print commands without executing them.", + ) + + return parser + + +def main() -> None: + parser = _build_parser() + args = parser.parse_args() + + scenario_ids = load_scenario_ids(args.scenario_ids) + methods = 
selected_methods( + method_name=args.agent_name, + methods=build_methods(args), + ) + + print(f"Loaded {len(scenario_ids)} scenario ids from {args.scenario_ids}") + print(f"Selected methods: {', '.join(method.agent_name for method in methods)}") + + for method in methods: + trajectory_dir = args.trajectory_root / method.agent_name + report_dir = args.reports_root / method.agent_name + + if not args.dry_run: + trajectory_dir.mkdir(parents=True, exist_ok=True) + report_dir.mkdir(parents=True, exist_ok=True) + + for scenario_id in scenario_ids: + expected_trajectory = trajectory_dir / f"{method.agent_name}_{scenario_id}.json" + + if args.skip_existing and expected_trajectory.exists(): + print( + f"Skipping scenario {scenario_id}; trajectory exists: {expected_trajectory}" + ) + continue + + try: + validate_groundtruth_exists(args.scenario_root, scenario_id) + question = read_question(args.scenario_root, scenario_id) + + # Uniform CouchDB preparation for every agent and every scenario. + reset_and_load_couchdb( + scenario_id=scenario_id, + scenario_root=args.scenario_root, + dry_run=args.dry_run, + ) + + run_agent_for_scenario( + method=method, + scenario_id=scenario_id, + question=question, + trajectory_dir=trajectory_dir, + dry_run=args.dry_run, + ) + except Exception as exc: + print( + f"error: scenario {scenario_id} failed for method {method.agent_name}: {exc}", + file=sys.stderr, + ) + if not args.continue_on_error: + raise + + if not args.no_evaluate: + run_evaluation( + trajectory_dir=trajectory_dir, + scenario_root=args.scenario_root, + report_dir=report_dir, + dry_run=args.dry_run, + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/benchmark/tests/__init__.py b/src/benchmark/tests/__init__.py new file mode 100644 index 00000000..d25c211d --- /dev/null +++ b/src/benchmark/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for benchmark runners.""" \ No newline at end of file diff --git 
a/src/benchmark/tests/test_scenario_suite_runner.py b/src/benchmark/tests/test_scenario_suite_runner.py new file mode 100644 index 00000000..ae435a42 --- /dev/null +++ b/src/benchmark/tests/test_scenario_suite_runner.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +from argparse import Namespace +from pathlib import Path + +import pytest + +from benchmark import scenario_suite_runner as mr + + +def test_load_scenario_ids_ignores_blank_lines_and_comments(tmp_path: Path) -> None: + p = tmp_path / "scenarios.txt" + p.write_text( + """ + # scenario_suite scenarios + + 11 + 12 + + # more + 14 + 15 + """, + encoding="utf-8", + ) + + assert mr.load_scenario_ids(p) == ["11", "12", "14", "15"] + + +def test_load_scenario_ids_raises_for_missing_file(tmp_path: Path) -> None: + p = tmp_path / "missing.txt" + + with pytest.raises(FileNotFoundError): + mr.load_scenario_ids(p) + + +def test_scenario_dir_for_id() -> None: + root = Path("/tmp/scenarios_data") + assert mr.scenario_dir_for_id(root, "11") == root / "scenario_11" + + +def test_read_question_reads_question_txt(tmp_path: Path) -> None: + scenario_dir = tmp_path / "scenario_11" + scenario_dir.mkdir() + (scenario_dir / "question.txt").write_text("What is the count?", encoding="utf-8") + + assert mr.read_question(tmp_path, "11") == "What is the count?" 
+ + +def test_read_question_raises_when_missing(tmp_path: Path) -> None: + (tmp_path / "scenario_11").mkdir() + + with pytest.raises(FileNotFoundError): + mr.read_question(tmp_path, "11") + + +def test_build_methods_uses_cli_defaults() -> None: + args = Namespace( + direct_model_id="tokenrouter/MiniMax-M3", + stirrup_model_id="tokenrouter/MiniMax-M3", + ) + + methods = mr.build_methods(args) + + assert methods["direct_llm"].command == "direct-llm-agent" + assert methods["direct_llm"].model_id == "tokenrouter/MiniMax-M3" + assert methods["stirrup_agent"].command == "stirrup-agent" + assert methods["stirrup_agent"].model_id == "tokenrouter/MiniMax-M3" + + +def test_selected_methods_direct_llm_only() -> None: + methods = { + "direct_llm": mr.MethodConfig( + agent_name="direct_llm", + command="direct-llm-agent", + model_id="tokenrouter/MiniMax-M3", + ), + "stirrup_agent": mr.MethodConfig( + agent_name="stirrup_agent", + command="stirrup-agent", + model_id="tokenrouter/MiniMax-M3", + ), + } + + selected = mr.selected_methods(method_name="direct_llm", methods=methods) + + assert len(selected) == 1 + assert selected[0].agent_name == "direct_llm" + + +def test_selected_methods_all_returns_both() -> None: + methods = { + "direct_llm": mr.MethodConfig( + agent_name="direct_llm", + command="direct-llm-agent", + model_id="tokenrouter/MiniMax-M3", + ), + "stirrup_agent": mr.MethodConfig( + agent_name="stirrup_agent", + command="stirrup-agent", + model_id="tokenrouter/MiniMax-M3", + ), + } + + selected = mr.selected_methods(method_name="all", methods=methods) + + assert [m.agent_name for m in selected] == ["direct_llm", "stirrup_agent"] + + +def test_run_agent_for_scenario_dry_run_does_not_call_subprocess( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + called = False + + def fake_run(*args, **kwargs): + nonlocal called + called = True + raise AssertionError("subprocess.run should not be called in dry_run") + + monkeypatch.setattr(mr.subprocess, "run", fake_run) 
+ + method = mr.MethodConfig( + agent_name="direct_llm", + command="direct-llm-agent", + model_id="tokenrouter/MiniMax-M3", + ) + + mr.run_agent_for_scenario( + method=method, + scenario_id="11", + question="What is the count?", + trajectory_dir=tmp_path / "traj", + dry_run=True, + ) + + assert called is False + + +def test_run_evaluation_dry_run_does_not_call_subprocess( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + called = False + + def fake_run(*args, **kwargs): + nonlocal called + called = True + raise AssertionError("subprocess.run should not be called in dry_run") + + monkeypatch.setattr(mr.subprocess, "run", fake_run) + + mr.run_evaluation( + trajectory_dir=tmp_path / "traj", + scenario_root=tmp_path / "scenarios", + report_dir=tmp_path / "reports", + dry_run=True, + ) + + assert called is False \ No newline at end of file diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py index 22661725..66ee508d 100644 --- a/src/evaluation/cli.py +++ b/src/evaluation/cli.py @@ -47,7 +47,8 @@ def _build_parser() -> argparse.ArgumentParser: "--scorer-default", dest="scorer_default", default="llm_judge", - help="Scorer name when scenario.scoring_method is unset. Default: llm_judge.", + help="Scorer name when scenario.scoring_method is unset. 
" + "Default: llm_judge.", ) p.add_argument( "--judge-model", diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py index a9a1bed9..27845f08 100644 --- a/src/evaluation/evaluator.py +++ b/src/evaluation/evaluator.py @@ -82,9 +82,7 @@ def _score_one( def _resolve(name: str) -> Scorer: return scorer_registry.get(name) - def _validate_judge_model( - self, scorer_name: str, traj: PersistedTrajectory - ) -> None: + def _validate_judge_model(self, scorer_name: str, traj: PersistedTrajectory) -> None: if scorer_name != "llm_judge" or not self.judge_model: return diff --git a/src/evaluation/loader.py b/src/evaluation/loader.py index e1481f0f..1d5c0c9d 100644 --- a/src/evaluation/loader.py +++ b/src/evaluation/loader.py @@ -45,20 +45,20 @@ def _load_one_trajectory(path: Path) -> PersistedTrajectory: def load_scenarios(paths: Iterable[Path] | Path) -> list[Scenario]: """Load scenarios from one or more files or directories. - Supported inputs: + Supported inputs: - 1. Existing JSON / JSONL scenario files. - 2. A directory containing scenario subdirectories, each with - ``groundtruth.txt``. For example: + 1. Existing JSON / JSONL scenario files. + 2. A directory containing scenario subdirectories, each with + ``groundtruth.txt``. For example: - scenarios_data/ - scenario_11/ - groundtruth.txt - scenario_12/ - groundtruth.txt + scenarios_data/ + scenario_11/ + groundtruth.txt + scenario_12/ + groundtruth.txt - For folder-based scenarios, the folder name becomes the scenario id and - ``groundtruth.txt`` becomes ``expected_answer``. + For folder-based scenarios, the folder name becomes the scenario id and + ``groundtruth.txt`` becomes ``expected_answer``. 
""" if isinstance(paths, (str, Path)): paths = [Path(paths)] @@ -154,4 +154,4 @@ def join_records( continue scenario = by_id.get(traj.scenario_id) if scenario is not None: - yield scenario, traj + yield scenario, traj \ No newline at end of file diff --git a/src/evaluation/metrics.py b/src/evaluation/metrics.py index 0255263c..325074a7 100644 --- a/src/evaluation/metrics.py +++ b/src/evaluation/metrics.py @@ -40,9 +40,7 @@ def _from_sdk_trajectory(traj: dict, model: str) -> OpsMetrics: tokens_in = sum(int(t.get("input_tokens") or 0) for t in turns) tokens_out = sum(int(t.get("output_tokens") or 0) for t in turns) - durations_ms = [ - t.get("duration_ms") for t in turns if t.get("duration_ms") is not None - ] + durations_ms = [t.get("duration_ms") for t in turns if t.get("duration_ms") is not None] duration_ms = sum(durations_ms) if durations_ms else None tool_names: list[str] = [] @@ -67,7 +65,11 @@ def _from_plan_execute(steps: list[Any], model: str) -> OpsMetrics: # plan-execute persists ``list[StepResult]``; the dataclass exposes # ``server`` / ``tool`` / ``response`` fields but no per-step token # counts, so we surface what is available and leave the rest at zero. 
- tool_names = [s.get("tool") for s in steps if isinstance(s, dict) and s.get("tool")] + tool_names = [ + s.get("tool") + for s in steps + if isinstance(s, dict) and s.get("tool") + ] return OpsMetrics( turn_count=len(steps), tool_call_count=len(tool_names), diff --git a/src/evaluation/models.py b/src/evaluation/models.py index 2f57e0b3..353619f2 100644 --- a/src/evaluation/models.py +++ b/src/evaluation/models.py @@ -116,3 +116,4 @@ class EvalReport(BaseModel): by_scenario_type: dict[str, TypeBreakdown] = Field(default_factory=dict) ops: AggregateOps = Field(default_factory=AggregateOps) results: list[ScenarioResult] = Field(default_factory=list) + score_summary: dict[str, Any] | None = None diff --git a/src/evaluation/report.py b/src/evaluation/report.py index 6ff5a9d2..638f2aa5 100644 --- a/src/evaluation/report.py +++ b/src/evaluation/report.py @@ -6,6 +6,7 @@ import json from collections import defaultdict from pathlib import Path +from typing import Any from .metrics import aggregate_ops from .models import EvalReport, ScenarioResult, TypeBreakdown @@ -13,6 +14,95 @@ _AGGREGATE_FILENAME = "_aggregate.json" +def _safe_float(value: Any) -> float | None: + if isinstance(value, bool): + return None + if isinstance(value, (int, float)): + return float(value) + return None + + +def _avg(values: list[float]) -> float | None: + return round(sum(values) / len(values), 4) if values else None + + +def _aggregate_score_summary(results: list[ScenarioResult]) -> dict[str, Any]: + """Aggregate static_json-style score.details across all results. + + Per-scenario key-level details stay in each result. Here we summarize the + numeric metrics and count totals across the full batch. 
+ """ + metric_names = [ + "partial_exact_match_accuracy", + "strict_exact_match_accuracy", + "partial_similarity_score", + "precision", + "recall", + "f1", + "total_gold_keys", + "total_model_keys", + "matched_keys", + "exact_value_matches", + ] + + score_values: dict[str, list[float]] = {name: [] for name in metric_names} + score_values["score"] = [] + + missing_keys_total = 0 + extra_keys_total = 0 + detail_entries_total = 0 + scored_results = 0 + + for result in results: + # Top-level score field, if present + score_value = _safe_float(result.score.score) + if score_value is not None: + score_values["score"].append(score_value) + + details = result.score.details + if not isinstance(details, dict): + continue + + scored_results += 1 + + for name in metric_names: + value = _safe_float(details.get(name)) + if value is not None: + score_values[name].append(value) + + missing = details.get("missing_keys") + if isinstance(missing, list): + missing_keys_total += len(missing) + + extra = details.get("extra_keys") + if isinstance(extra, list): + extra_keys_total += len(extra) + + per_key_details = details.get("details") + if isinstance(per_key_details, list): + detail_entries_total += len(per_key_details) + + return { + "scored_results": scored_results, + "score_avg": _avg(score_values["score"]), + "score_min": round(min(score_values["score"]), 4) if score_values["score"] else None, + "score_max": round(max(score_values["score"]), 4) if score_values["score"] else None, + "partial_exact_match_accuracy_avg": _avg(score_values["partial_exact_match_accuracy"]), + "strict_exact_match_accuracy_avg": _avg(score_values["strict_exact_match_accuracy"]), + "partial_similarity_score_avg": _avg(score_values["partial_similarity_score"]), + "precision_avg": _avg(score_values["precision"]), + "recall_avg": _avg(score_values["recall"]), + "f1_avg": _avg(score_values["f1"]), + "total_gold_keys_avg": _avg(score_values["total_gold_keys"]), + "total_model_keys_avg": 
_avg(score_values["total_model_keys"]), + "matched_keys_avg": _avg(score_values["matched_keys"]), + "exact_value_matches_avg": _avg(score_values["exact_value_matches"]), + "missing_keys_total": missing_keys_total, + "extra_keys_total": extra_keys_total, + "detail_entries_total": detail_entries_total, + } + + def build_report(results: list[ScenarioResult]) -> EvalReport: total = len(results) passed = sum(1 for r in results if r.score.passed) @@ -43,6 +133,7 @@ def build_report(results: list[ScenarioResult]) -> EvalReport: }, by_scenario_type=breakdown, ops=aggregate_ops(results), + score_summary=_aggregate_score_summary(results), results=results, ) @@ -58,7 +149,7 @@ def write_reports_dir(report: EvalReport, reports_dir: Path) -> Path: """Write one JSON file per result (``.json``) plus an aggregate. Results without a ``run_id`` fall back to ``.json`` so - nothing is dropped. Returns the directory path. + nothing is dropped. Returns the directory path. """ reports_dir = Path(reports_dir) reports_dir.mkdir(parents=True, exist_ok=True) @@ -66,7 +157,6 @@ def write_reports_dir(report: EvalReport, reports_dir: Path) -> Path: used: dict[str, int] = {} for r in report.results: stem = r.run_id or f"scenario-{r.scenario_id}" - # Disambiguate any collisions deterministically. 
suffix = used.get(stem, 0) used[stem] = suffix + 1 name = stem if suffix == 0 else f"{stem}-{suffix}" @@ -88,6 +178,49 @@ def render_summary(report: EvalReport) -> str: f"Passed: {t.get('passed', 0)} " f"Pass rate: {t.get('pass_rate', 0):.1%}" ) + + if report.score_summary: + s = report.score_summary + lines.append("") + lines.append("Static JSON summary:") + if s.get("score_avg") is not None: + lines.append(f" score_avg: {s['score_avg']:.4f}") + if s.get("score_min") is not None: + lines.append(f" score_min: {s['score_min']:.4f}") + if s.get("score_max") is not None: + lines.append(f" score_max: {s['score_max']:.4f}") + if s.get("partial_exact_match_accuracy_avg") is not None: + lines.append( + f" partial_exact_match_avg: {s['partial_exact_match_accuracy_avg']:.4f}" + ) + if s.get("strict_exact_match_accuracy_avg") is not None: + lines.append( + f" strict_exact_match_avg: {s['strict_exact_match_accuracy_avg']:.4f}" + ) + if s.get("partial_similarity_score_avg") is not None: + lines.append( + f" partial_similarity_avg: {s['partial_similarity_score_avg']:.4f}" + ) + if s.get("precision_avg") is not None: + lines.append(f" precision_avg: {s['precision_avg']:.4f}") + if s.get("recall_avg") is not None: + lines.append(f" recall_avg: {s['recall_avg']:.4f}") + if s.get("f1_avg") is not None: + lines.append(f" f1_avg: {s['f1_avg']:.4f}") + if s.get("total_gold_keys_avg") is not None: + lines.append(f" total_gold_keys_avg: {s['total_gold_keys_avg']:.4f}") + if s.get("total_model_keys_avg") is not None: + lines.append(f" total_model_keys_avg: {s['total_model_keys_avg']:.4f}") + if s.get("matched_keys_avg") is not None: + lines.append(f" matched_keys_avg: {s['matched_keys_avg']:.4f}") + if s.get("exact_value_matches_avg") is not None: + lines.append( + f" exact_value_matches_avg: {s['exact_value_matches_avg']:.4f}" + ) + lines.append(f" missing_keys_total: {s.get('missing_keys_total', 0)}") + lines.append(f" extra_keys_total: {s.get('extra_keys_total', 0)}") + 
lines.append(f" detail_entries_total: {s.get('detail_entries_total', 0)}") + if report.by_scenario_type: lines.append("") lines.append("By scenario type:") @@ -95,6 +228,7 @@ def render_summary(report: EvalReport) -> str: lines.append( f" {stype:<16} {b.passed:>4}/{b.total:<4} ({b.pass_rate:.1%})" ) + o = report.ops lines.append("") lines.append("Operational metrics:") @@ -112,4 +246,4 @@ def render_summary(report: EvalReport) -> str: def report_to_json(report: EvalReport) -> str: """Convenience JSON dump that round-trips through pydantic.""" - return json.dumps(json.loads(report.model_dump_json()), indent=2) + return json.dumps(json.loads(report.model_dump_json()), indent=2) \ No newline at end of file diff --git a/src/evaluation/scorers/__init__.py b/src/evaluation/scorers/__init__.py index 37973fae..f681844a 100644 --- a/src/evaluation/scorers/__init__.py +++ b/src/evaluation/scorers/__init__.py @@ -30,7 +30,9 @@ def register(name: str, scorer: Scorer) -> None: def get(name: str) -> Scorer: if name not in _REGISTRY: - raise KeyError(f"unknown scorer {name!r}; registered: {sorted(_REGISTRY)}") + raise KeyError( + f"unknown scorer {name!r}; registered: {sorted(_REGISTRY)}" + ) return _REGISTRY[name] @@ -46,4 +48,4 @@ def names() -> list[str]: from . 
import semantic # noqa: E402,F401 from .static_json import install as _install_static_json # noqa: E402 -_install_static_json() +_install_static_json() \ No newline at end of file diff --git a/src/evaluation/scorers/llm_judge.py b/src/evaluation/scorers/llm_judge.py index 139744dd..e37ecc21 100644 --- a/src/evaluation/scorers/llm_judge.py +++ b/src/evaluation/scorers/llm_judge.py @@ -140,7 +140,9 @@ def __call__( if review.get("hallucinations") is True: score = max(0.0, score - 0.2) - rationale = str(review.get("suggestions") or review.get("reason") or "")[:500] + rationale = str( + review.get("suggestions") or review.get("reason") or "" + )[:500] return ScorerResult( scorer=self.name, passed=passed, diff --git a/src/evaluation/scorers/static_json.py b/src/evaluation/scorers/static_json.py index 20fe7262..a26c53db 100644 --- a/src/evaluation/scorers/static_json.py +++ b/src/evaluation/scorers/static_json.py @@ -114,7 +114,9 @@ def _extract_balanced_structure(content: str) -> str: (content.find("("), "(", ")"), ] candidates = [ - (idx, open_ch, close_ch) for idx, open_ch, close_ch in candidates if idx != -1 + (idx, open_ch, close_ch) + for idx, open_ch, close_ch in candidates + if idx != -1 ] if not candidates: @@ -365,7 +367,9 @@ def evaluate_static_json( precision = exact_matches / total_model_keys if total_model_keys else 0.0 recall = exact_matches / total_gold_keys if total_gold_keys else 0.0 f1 = ( - 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0 + 2 * precision * recall / (precision + recall) + if precision + recall > 0 + else 0.0 ) partial_exact = exact_matches / total_gold_keys if total_gold_keys else 0.0 @@ -388,7 +392,6 @@ def evaluate_static_json( details=details, ) - def evaluate_static_json_batch( pairs: list[tuple[Any, Any]], *, @@ -436,7 +439,6 @@ def evaluate_static_json_batch( "examples": [score.to_dict() for score in scores], } - class StaticJsonScorer: """Evaluation scorer wrapper for the trajectory-based 
pipeline.""" @@ -480,4 +482,4 @@ def __call__( def install(name: str = "static_json") -> None: """Register the static JSON scorer.""" - register(name, StaticJsonScorer(name=name)) + register(name, StaticJsonScorer(name=name)) \ No newline at end of file diff --git a/src/evaluation/tests/test_loader.py b/src/evaluation/tests/test_loader.py index 72b3b3e5..27d5c9a9 100644 --- a/src/evaluation/tests/test_loader.py +++ b/src/evaluation/tests/test_loader.py @@ -21,9 +21,7 @@ def test_load_trajectories_from_dir(trajectory_dir: Path): def test_load_trajectories_skips_unparseable(tmp_path: Path, make_persisted_record): - (tmp_path / "good.json").write_text( - json.dumps(make_persisted_record()), encoding="utf-8" - ) + (tmp_path / "good.json").write_text(json.dumps(make_persisted_record()), encoding="utf-8") (tmp_path / "bad.json").write_text("{not json", encoding="utf-8") records = load_trajectories(tmp_path) assert len(records) == 1 @@ -32,7 +30,9 @@ def test_load_trajectories_skips_unparseable(tmp_path: Path, make_persisted_reco def test_load_scenarios_json_list(tmp_path: Path): p = tmp_path / "s.json" p.write_text( - json.dumps([{"id": 1, "text": "Q1"}, {"id": "2", "text": "Q2"}]), + json.dumps( + [{"id": 1, "text": "Q1"}, {"id": "2", "text": "Q2"}] + ), encoding="utf-8", ) out = load_scenarios(p) @@ -65,9 +65,7 @@ def test_join_drops_orphans(make_persisted_record): ] trajs = [ PersistedTrajectory.from_raw(make_persisted_record(scenario_id=1)), - PersistedTrajectory.from_raw( - make_persisted_record(run_id="r2", scenario_id=99) - ), + PersistedTrajectory.from_raw(make_persisted_record(run_id="r2", scenario_id=99)), ] pairs = list(join_records(scenarios, trajs)) assert len(pairs) == 1 @@ -110,4 +108,4 @@ def test_load_scenarios_from_groundtruth_folders(tmp_path): assert len(scenarios) == 1 assert scenarios[0].id == "11" assert scenarios[0].expected_answer == "{'energy': 14, 'material': 48}" - assert scenarios[0].scoring_method == "static_json" + assert 
scenarios[0].scoring_method == "static_json" \ No newline at end of file diff --git a/src/evaluation/tests/test_metrics.py b/src/evaluation/tests/test_metrics.py index df096d03..21f097b1 100644 --- a/src/evaluation/tests/test_metrics.py +++ b/src/evaluation/tests/test_metrics.py @@ -47,27 +47,9 @@ def test_plan_execute_list_trajectory(self, make_persisted_record): rec = PersistedTrajectory.from_raw( make_persisted_record( trajectory=[ - { - "step_number": 1, - "task": "t", - "server": "iot", - "tool": "sites", - "response": "ok", - }, - { - "step_number": 2, - "task": "t2", - "server": "iot", - "tool": "assets", - "response": "ok", - }, - { - "step_number": 3, - "task": "t3", - "server": "iot", - "tool": "sites", - "response": "ok", - }, + {"step_number": 1, "task": "t", "server": "iot", "tool": "sites", "response": "ok"}, + {"step_number": 2, "task": "t2", "server": "iot", "tool": "assets", "response": "ok"}, + {"step_number": 3, "task": "t3", "server": "iot", "tool": "sites", "response": "ok"}, ] ) ) @@ -85,21 +67,9 @@ def test_empty(self): def test_sums_and_percentiles(self): results = [ - _result( - ops=OpsMetrics( - tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1 - ) - ), - _result( - ops=OpsMetrics( - tokens_in=20, tokens_out=10, duration_ms=300.0, tool_call_count=2 - ) - ), - _result( - ops=OpsMetrics( - tokens_in=30, tokens_out=15, duration_ms=500.0, tool_call_count=3 - ) - ), + _result(ops=OpsMetrics(tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1)), + _result(ops=OpsMetrics(tokens_in=20, tokens_out=10, duration_ms=300.0, tool_call_count=2)), + _result(ops=OpsMetrics(tokens_in=30, tokens_out=15, duration_ms=500.0, tool_call_count=3)), ] agg = aggregate_ops(results) assert agg.tokens_in_total == 60 @@ -120,10 +90,7 @@ def test_cost_only_when_some_present(self): class TestNormalizeModel: def test_strips_provider_prefix(self): - assert ( - _normalize_model("litellm_proxy/anthropic/claude-opus-4-5") - == "claude-opus-4-5" - ) 
+ assert _normalize_model("litellm_proxy/anthropic/claude-opus-4-5") == "claude-opus-4-5" assert _normalize_model("watsonx/ibm/granite-13b") == "granite-13b" def test_strips_long_numeric_suffix(self): diff --git a/src/evaluation/tests/test_models.py b/src/evaluation/tests/test_models.py index 621107a0..4aca4d55 100644 --- a/src/evaluation/tests/test_models.py +++ b/src/evaluation/tests/test_models.py @@ -10,9 +10,7 @@ def test_scenario_from_raw_coerces_int_id_to_str(): def test_scenario_preserves_extra_fields(): - s = Scenario.from_raw( - {"id": "1", "text": "Q", "characteristic_form": "X", "tolerance": 0.01} - ) + s = Scenario.from_raw({"id": "1", "text": "Q", "characteristic_form": "X", "tolerance": 0.01}) extra = s.model_extra or {} assert extra.get("tolerance") == 0.01 diff --git a/src/evaluation/tests/test_report.py b/src/evaluation/tests/test_report.py index aabb5042..0c821f5c 100644 --- a/src/evaluation/tests/test_report.py +++ b/src/evaluation/tests/test_report.py @@ -27,9 +27,7 @@ def _result(stype: str, passed: bool, run_id: str = "", **ops_kwargs) -> Scenari model="watsonx/ibm/granite", question="q", answer="a", - score=ScorerResult( - scorer="llm_judge", passed=passed, score=1.0 if passed else 0.0 - ), + score=ScorerResult(scorer="llm_judge", passed=passed, score=1.0 if passed else 0.0), ops=OpsMetrics(**ops_kwargs), ) @@ -100,17 +98,62 @@ def test_write_reports_dir_falls_back_to_scenario_id(tmp_path: Path): def test_render_summary_includes_headlines(): results = [ - _result( - "iot", - True, - tokens_in=10, - tokens_out=5, - duration_ms=100.0, - tool_call_count=1, - ), + _result("iot", True, tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1), _result("iot", False, tokens_in=8, tokens_out=4, duration_ms=200.0), ] text = render_summary(build_report(results)) assert "Pass rate" in text assert "iot" in text assert "tokens_in_total" in text + +def test_build_report_includes_score_summary(): + from evaluation.models import ScenarioResult, 
ScorerResult, OpsMetrics + + results = [ + ScenarioResult( + scenario_id="11", + scenario_type="structured", + run_id="direct_llm_11", + runner="direct-llm-agent", + model="tokenrouter/MiniMax-M3", + question="Q", + answer='{"energy":0,"material":0}', + score=ScorerResult( + scorer="static_json", + passed=False, + score=0.0, + rationale="structured answer differs from ground truth", + details={ + "partial_exact_match_accuracy": 0.0, + "strict_exact_match_accuracy": 0.0, + "partial_similarity_score": 0.0, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0, + "total_gold_keys": 2, + "total_model_keys": 2, + "matched_keys": 2, + "exact_value_matches": 0, + "missing_keys": [], + "extra_keys": [], + "details": [], + }, + ), + ops=OpsMetrics( + turn_count=1, + tool_call_count=0, + unique_tools=[], + tokens_in=390, + tokens_out=245, + duration_ms=6224.3382, + est_cost_usd=None, + ), + ) + ] + + report = build_report(results) + + assert report.score_summary is not None + assert report.score_summary["partial_exact_match_accuracy_avg"] == 0.0 + assert report.score_summary["strict_exact_match_accuracy_avg"] == 0.0 + assert report.score_summary["missing_keys_total"] == 0 \ No newline at end of file diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py index b82123f7..f8a936db 100644 --- a/src/evaluation/tests/test_runner.py +++ b/src/evaluation/tests/test_runner.py @@ -10,9 +10,7 @@ from evaluation import scorers as registry -def _always_pass_scorer( - scenario: Scenario, answer: str, trajectory_text: str -) -> ScorerResult: +def _always_pass_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: return ScorerResult(scorer="stub", passed=True, score=1.0) @@ -48,15 +46,11 @@ def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): assert report.ops.tokens_in_total > 0 -def _always_fail_scorer( - scenario: Scenario, answer: str, trajectory_text: str -) -> ScorerResult: +def _always_fail_scorer(scenario: Scenario, 
answer: str, trajectory_text: str) -> ScorerResult: return ScorerResult(scorer="stub-fail", passed=False, score=0.0) -def test_evaluate_uses_per_scenario_scoring_method( - tmp_path: Path, make_persisted_record -): +def test_evaluate_uses_per_scenario_scoring_method(tmp_path: Path, make_persisted_record): rec = make_persisted_record(run_id="run-x", scenario_id=1, answer="A.") (tmp_path / "run-x.json").write_text(json.dumps(rec), encoding="utf-8") diff --git a/src/evaluation/tests/test_static_json_scorer.py b/src/evaluation/tests/test_static_json_scorer.py index 97ce7239..175a320c 100644 --- a/src/evaluation/tests/test_static_json_scorer.py +++ b/src/evaluation/tests/test_static_json_scorer.py @@ -5,7 +5,6 @@ parse_structured_answer, ) - def test_parse_json_object_from_noisy_markdown_answer(): raw = 'Answer:\n```json\n{"energy": 3, "material": 12}\n```' @@ -123,6 +122,7 @@ def test_batch_evaluation(): assert result["strict_exact_match_accuracy"] == 0.5 + from evaluation.models import Scenario from evaluation.scorers.static_json import StaticJsonScorer @@ -147,4 +147,4 @@ def test_static_json_scorer_wrapper_exact_match(): assert result.scorer == "static_json" assert result.passed is True assert result.score == 1.0 - assert result.details["strict_exact_match_accuracy"] == 1.0 + assert result.details["strict_exact_match_accuracy"] == 1.0 \ No newline at end of file From 72d7ad075cdfa283ee107b39d6869aef98a0205a Mon Sep 17 00:00:00 2001 From: Chathurangi Shyalika Date: Thu, 18 Jun 2026 16:51:21 -0400 Subject: [PATCH 2/4] Updating README.md Signed-off-by: Chathurangi Shyalika --- benchmarks/scenario_suite/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/scenario_suite/README.md b/benchmarks/scenario_suite/README.md index 603c8261..e6239f65 100644 --- a/benchmarks/scenario_suite/README.md +++ b/benchmarks/scenario_suite/README.md @@ -70,7 +70,7 @@ The scenario folder name must match the id from `scenarios.txt`: Run the 
direct LLM baseline sequentially over the listed scenarios: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --direct-model-id tokenrouter/MiniMax-M3 +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --direct-model-id tokenrouter/MiniMax-M3 ``` This writes trajectories to: @@ -99,7 +99,7 @@ Run the Stirrup agent sequentially over the listed scenarios using the MiniMax m uv run python -m benchmark.scenario_suite_runner \ --scenario-ids benchmarks/scenario_suite/scenarios.txt \ --scenario-root /.../scenarios_data \ - --method stirrup_agent \ + --agent_name stirrup_agent \ --stirrup-model-id tokenrouter/MiniMax-M3 ``` @@ -115,12 +115,12 @@ and reports to: reports/scenario_suite/stirrup_agent/ ``` -## Run all methods +## Run all agents Run both supported methods one after the other: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method all +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name all ``` ## Useful options @@ -130,7 +130,7 @@ uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/sce Print the commands without executing them: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --dry-run +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --dry-run ``` ### Skip existing trajectories @@ -138,7 +138,7 @@ uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/sce 
Skip scenarios whose trajectory files already exist: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --skip-existing +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --skip-existing ``` ### Continue after errors @@ -146,7 +146,7 @@ uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/sce Keep running later scenarios even if one fails: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --continue-on-error +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --continue-on-error ``` ## Environment variables From 055a3c1ffcc5856fc43c6a7a257a27e5251c94bf Mon Sep 17 00:00:00 2001 From: Chathurangi Shyalika Date: Thu, 18 Jun 2026 19:25:26 -0400 Subject: [PATCH 3/4] Updating model parameters & Updating README.md Signed-off-by: Chathurangi Shyalika --- benchmarks/scenario_suite/README.md | 16 ++++++++-------- src/benchmark/scenario_suite_runner.py | 23 +++++++++-------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/benchmarks/scenario_suite/README.md b/benchmarks/scenario_suite/README.md index e6239f65..ea21337f 100644 --- a/benchmarks/scenario_suite/README.md +++ b/benchmarks/scenario_suite/README.md @@ -70,7 +70,7 @@ The scenario folder name must match the id from `scenarios.txt`: Run the direct LLM baseline sequentially over the listed scenarios: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --direct-model-id tokenrouter/MiniMax-M3 +uv run 
python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --model-id tokenrouter/MiniMax-M3 ``` This writes trajectories to: @@ -99,8 +99,8 @@ Run the Stirrup agent sequentially over the listed scenarios using the MiniMax m uv run python -m benchmark.scenario_suite_runner \ --scenario-ids benchmarks/scenario_suite/scenarios.txt \ --scenario-root /.../scenarios_data \ - --agent_name stirrup_agent \ - --stirrup-model-id tokenrouter/MiniMax-M3 + --method stirrup_agent \ + --model-id tokenrouter/MiniMax-M3 ``` This writes trajectories to: @@ -115,12 +115,12 @@ and reports to: reports/scenario_suite/stirrup_agent/ ``` -## Run all agents +## Run all methods Run both supported methods one after the other: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name all +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method all ``` ## Useful options @@ -130,7 +130,7 @@ uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/sce Print the commands without executing them: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --dry-run +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --dry-run ``` ### Skip existing trajectories @@ -138,7 +138,7 @@ uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/sce Skip scenarios whose trajectory files already exist: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name 
direct_llm --skip-existing +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --skip-existing ``` ### Continue after errors @@ -146,7 +146,7 @@ uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/sce Keep running later scenarios even if one fails: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --continue-on-error +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --continue-on-error ``` ## Environment variables diff --git a/src/benchmark/scenario_suite_runner.py b/src/benchmark/scenario_suite_runner.py index 09845479..fe836698 100644 --- a/src/benchmark/scenario_suite_runner.py +++ b/src/benchmark/scenario_suite_runner.py @@ -9,7 +9,8 @@ uv run python -m benchmark.scenario_suite_runner \ --scenario-ids benchmarks/scenario_suite/scenarios.txt \ --scenario-root /path/to/scenarios_data \ - --method direct_llm + --agent_name direct_llm \ + --model-id tokenrouter/MiniMax-M3 The scenario root is expected to contain folders such as: @@ -33,8 +34,7 @@ REPO_ROOT = Path(__file__).resolve().parents[2] -_DEFAULT_DIRECT_LLM_MODEL = "tokenrouter/MiniMax-M3" -_DEFAULT_STIRRUP_MODEL = "litellm_proxy/aws/claude-opus-4-8" +_DEFAULT_MODEL_ID = "tokenrouter/MiniMax-M3" @dataclass(frozen=True) @@ -221,12 +221,12 @@ def build_methods(args: argparse.Namespace) -> dict[str, MethodConfig]: "direct_llm": MethodConfig( agent_name="direct_llm", command="direct-llm-agent", - model_id=args.direct_model_id, + model_id=args.model_id, ), "stirrup_agent": MethodConfig( agent_name="stirrup_agent", command="stirrup-agent", - model_id=args.stirrup_model_id, + model_id=args.model_id, ), } @@ -269,7 +269,7 @@ def _build_parser() -> 
argparse.ArgumentParser: "--agent_name", choices=["direct_llm", "stirrup_agent", "all"], default="direct_llm", - help="Which method to run.", + help="Which agent to run.", ) parser.add_argument( "--trajectory-root", @@ -284,14 +284,9 @@ def _build_parser() -> argparse.ArgumentParser: help="Root directory for evaluation reports.", ) parser.add_argument( - "--direct-model-id", - default=_DEFAULT_DIRECT_LLM_MODEL, - help="Model id for direct_llm.", - ) - parser.add_argument( - "--stirrup-model-id", - default=_DEFAULT_STIRRUP_MODEL, - help="Model id for stirrup_agent.", + "--model-id", + default=_DEFAULT_MODEL_ID, + help="Model id used by both agents.", ) parser.add_argument( "--skip-existing", From afb3437ffe902f5e3a42d7692959d4179cef8b9a Mon Sep 17 00:00:00 2001 From: Chathurangi Shyalika Date: Thu, 18 Jun 2026 19:29:55 -0400 Subject: [PATCH 4/4] Updating README.md Signed-off-by: Chathurangi Shyalika --- benchmarks/scenario_suite/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/scenario_suite/README.md b/benchmarks/scenario_suite/README.md index ea21337f..2fd08335 100644 --- a/benchmarks/scenario_suite/README.md +++ b/benchmarks/scenario_suite/README.md @@ -70,7 +70,7 @@ The scenario folder name must match the id from `scenarios.txt`: Run the direct LLM baseline sequentially over the listed scenarios: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --model-id tokenrouter/MiniMax-M3 +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --model-id tokenrouter/MiniMax-M3 ``` This writes trajectories to: @@ -99,7 +99,7 @@ Run the Stirrup agent sequentially over the listed scenarios using the MiniMax m uv run python -m benchmark.scenario_suite_runner \ --scenario-ids 
benchmarks/scenario_suite/scenarios.txt \ --scenario-root /.../scenarios_data \ - --method stirrup_agent \ + --agent_name stirrup_agent \ --model-id tokenrouter/MiniMax-M3 ``` @@ -115,12 +115,12 @@ and reports to: reports/scenario_suite/stirrup_agent/ ``` -## Run all methods +## Run all agents -Run both supported methods one after the other: +Run all supported agents one after the other: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method all +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name all ``` ## Useful options @@ -130,7 +130,7 @@ uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/sce Print the commands without executing them: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --dry-run +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --dry-run ``` ### Skip existing trajectories @@ -138,7 +138,7 @@ uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/sce Skip scenarios whose trajectory files already exist: ```bash -uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --skip-existing +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --skip-existing ``` ### Continue after errors @@ -146,7 +146,7 @@ uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/sce Keep running later scenarios even if one fails: ```bash -uv run python -m 
benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --method direct_llm --continue-on-error +uv run python -m benchmark.scenario_suite_runner --scenario-ids benchmarks/scenario_suite/scenarios.txt --scenario-root /.../scenarios_data --agent_name direct_llm --continue-on-error ``` ## Environment variables