# `agentops run view` command (src/agentops/cli/browse_commands.py).
@run_app.command("view")
def cmd_run_view(
    run_id: str = typer.Argument(help="Run ID (timestamp folder name or 'latest')."),
    entry: Annotated[
        int | None,
        typer.Option("--entry", help="Show detail for a specific row index."),
    ] = None,
    directory: Path = typer.Option(
        Path("."),
        "--dir",
        help="Workspace directory.",
    ),
) -> None:
    """View per-row metrics for an evaluation run."""
    # NOTE: the docstring is deliberately one line -- Typer displays it as the
    # command's help text, so the details are kept in comments.
    #
    # Output modes:
    #   * without --entry: a table with one line per row and one score column
    #     per metric (the "samples_evaluated" metric is left out);
    #   * with --entry N:  the scores and threshold outcomes of row N only.
    # Exits with code 1, message on stderr, when the service raises
    # FileNotFoundError or ValueError.
    from agentops.services.browse import view_run

    try:
        result = view_run(run_id=run_id, directory=directory, entry=entry)
    except (FileNotFoundError, ValueError) as exc:
        typer.echo(f"Error: {exc}", err=True)
        raise typer.Exit(code=1) from exc

    status = "PASS" if result.overall_passed else "FAIL"
    typer.echo(
        f"Run: {result.run_id} ({status}) "
        f"bundle={result.bundle_name} dataset={result.dataset_name}"
    )

    if entry is not None:
        # Detail view for a single row. view_run() narrows the result to the
        # requested row (or raises ValueError), so rows[0] is that row.
        row = result.rows[0]
        row_status = "PASS" if row.passed_all else "FAIL"
        typer.echo(f"\nRow {row.row_index}: {row_status}")
        typer.echo("")
        typer.echo("Scores:")
        for name, value in row.scores.items():
            typer.echo(f" {name:<40} {value:.4f}")
        if row.threshold_results:
            typer.echo("")
            typer.echo("Thresholds:")
            for t in row.threshold_results:
                mark = "PASS" if t["passed"] else "FAIL"
                # `!s` converts to text before padding: the values are typed
                # as Any, and e.g. None rejects an alignment format spec.
                typer.echo(
                    f" {t['evaluator']:<40} {t['criteria']} {t['expected']!s:<10} "
                    f"actual={t['actual']!s:<10} {mark}"
                )
        return

    # Table view for all rows.
    if not result.rows:
        typer.echo("\nNo per-row metrics available.")
        return

    # One (metric name, column label, column width) triple per score column.
    # The width is at least 10 and grows to fit the label, so that the header
    # and the score cells below it stay aligned for long metric names.
    columns = []
    for name in result.evaluator_names:
        if name == "samples_evaluated":
            continue
        label = name.replace("Evaluator", "")
        columns.append((name, label, max(10, len(label))))

    # Header
    header = f"{'Row':>4} {'Pass':4}" + "".join(
        f" {label:>{width}}" for _, label, width in columns
    )
    typer.echo("")
    typer.echo(header)
    typer.echo("-" * len(header))

    # Rows
    for row in result.rows:
        row_status = "PASS" if row.passed_all else "FAIL"
        cells = [f"{row.row_index:>4} {row_status:4}"]
        for name, _, width in columns:
            val = row.scores.get(name)
            if val is not None:
                cells.append(f" {val:>{width}.2f}")
            else:
                # Row has no score for this metric.
                cells.append(f" {'—':>{width}}")
        typer.echo("".join(cells))
# Row-level run view service (src/agentops/services/browse.py).
# ---------------------------------------------------------------------------
# Run view (row-level detail)
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class RowView:
    """Per-row metrics and threshold results."""

    # Index of the row as recorded in the run's results.json.
    row_index: int
    # Pass flag taken from the row's item evaluation; view_run() uses True
    # when the row has metrics but no item evaluation.
    passed_all: bool
    # Metric name -> score for this row.
    scores: Dict[str, float]
    # One dict per threshold check, with the keys "evaluator", "criteria",
    # "expected", "actual" and "passed".
    threshold_results: List[Dict[str, Any]]


@dataclass(frozen=True)
class RunViewResult:
    """Result of viewing a run with row-level detail."""

    # Run identifier exactly as passed to view_run().
    run_id: str
    bundle_name: str
    dataset_name: str
    # Overall pass flag from the run summary.
    overall_passed: bool
    # Rows in ascending row_index order (a single row when filtered by entry).
    rows: List[RowView]
    # Metric names in order of first appearance across the run's row metrics.
    evaluator_names: List[str]


def _recent_run_ids(results_dir: Path, limit: int = 5) -> List[str]:
    """Return up to ``limit`` run folder names in descending name order.

    Only directories that contain a ``results.json`` are listed and the
    ``latest`` entry is skipped. A missing ``results_dir`` yields an empty list
    instead of letting ``iterdir()`` raise.
    """
    if not results_dir.is_dir():
        return []
    names = [
        d.name
        for d in sorted(results_dir.iterdir(), reverse=True)
        if d.is_dir() and d.name != "latest" and (d / "results.json").exists()
    ]
    return names[:limit]


def view_run(
    run_id: str,
    directory: Path = Path("."),
    entry: Optional[int] = None,
) -> RunViewResult:
    """Load run results with per-row metric breakdown.

    If ``entry`` is provided, only that row is included.

    Args:
        run_id: Name of the run folder under ``<workspace>/results``.
        directory: Directory handed to ``resolve_workspace`` to locate the
            workspace.
        entry: Optional row index to narrow the result to.

    Returns:
        A ``RunViewResult`` whose rows are sorted by row index.

    Raises:
        FileNotFoundError: If the run folder or its ``results.json`` is missing.
        ValueError: If ``entry`` is given but matches no row. Malformed JSON
            also surfaces as ``ValueError`` (``json.JSONDecodeError`` is a
            subclass).
    """
    workspace = resolve_workspace(directory)
    results_dir = workspace / "results"

    run_dir = (results_dir / run_id).resolve()
    if not run_dir.is_dir():
        recent = _recent_run_ids(results_dir)
        hint = ", ".join(recent) if recent else "(none)"
        raise FileNotFoundError(
            f"Run '{run_id}' not found in {results_dir}. Recent runs: {hint}"
        )

    results_file = run_dir / "results.json"
    if not results_file.exists():
        raise FileNotFoundError(f"No results.json in {run_dir}")

    data = json.loads(results_file.read_text(encoding="utf-8"))
    result = RunResult.model_validate(data)

    # Build per-row scores lookup. A dict doubles as an insertion-ordered set
    # of metric names so the column order follows first appearance.
    row_scores: Dict[int, Dict[str, float]] = {}
    evaluator_names: Dict[str, None] = {}
    for row in result.row_metrics:
        scores = {m.name: m.value for m in row.metrics}
        evaluator_names.update(dict.fromkeys(scores))
        row_scores[row.row_index] = scores

    # Build per-row threshold results
    row_thresholds: Dict[int, List[Dict[str, Any]]] = {}
    row_passed: Dict[int, bool] = {}
    for item in result.item_evaluations:
        row_passed[item.row_index] = item.passed_all
        row_thresholds[item.row_index] = [
            {
                "evaluator": t.evaluator,
                "criteria": t.criteria,
                "expected": t.expected,
                "actual": t.actual,
                "passed": t.passed,
            }
            for t in item.thresholds
        ]

    # A row is listed when it has metrics, an item evaluation, or both.
    all_row_indices = sorted(set(row_scores) | set(row_passed))

    if entry is not None:
        if entry not in row_scores and entry not in row_passed:
            available = ", ".join(str(i) for i in all_row_indices) or "(none)"
            raise ValueError(f"Entry {entry} not found. Available rows: {available}")
        all_row_indices = [entry]

    rows: List[RowView] = [
        RowView(
            row_index=idx,
            # Rows without an item evaluation are reported as passing.
            passed_all=row_passed.get(idx, True),
            scores=row_scores.get(idx, {}),
            threshold_results=row_thresholds.get(idx, []),
        )
        for idx in all_row_indices
    ]

    return RunViewResult(
        run_id=run_id,
        bundle_name=result.bundle.name,
        dataset_name=result.dataset.name,
        overall_passed=result.summary.overall_passed,
        rows=rows,
        evaluator_names=list(evaluator_names),
    )
# Tests for the run view service and CLI (tests/unit/test_browse.py).

# Run folder name shared by every run view test below.
_VIEW_RUN_ID = "2026-04-07_100000"


# ---------------------------------------------------------------------------
# view_run service tests
# ---------------------------------------------------------------------------


class TestViewRun:
    """Service-level tests for ``view_run``."""

    def test_table_view(self, tmp_path: Path) -> None:
        """Without ``entry`` every row is returned with its scores."""
        ws = _create_workspace(tmp_path)
        _write_run(ws, _VIEW_RUN_ID, passed=True)
        result = view_run(_VIEW_RUN_ID, directory=tmp_path)
        assert result.run_id == _VIEW_RUN_ID
        assert len(result.rows) == 2
        assert result.rows[0].row_index == 1
        assert result.rows[0].scores["CoherenceEvaluator"] == 5.0
        assert result.rows[1].scores["RelevanceEvaluator"] == 5.0

    def test_entry_filter(self, tmp_path: Path) -> None:
        """``entry`` narrows the result to the requested row."""
        ws = _create_workspace(tmp_path)
        _write_run(ws, _VIEW_RUN_ID)
        result = view_run(_VIEW_RUN_ID, directory=tmp_path, entry=2)
        assert len(result.rows) == 1
        assert result.rows[0].row_index == 2

    def test_entry_not_found(self, tmp_path: Path) -> None:
        """An unknown row index raises ``ValueError``."""
        ws = _create_workspace(tmp_path)
        _write_run(ws, _VIEW_RUN_ID)
        with pytest.raises(ValueError, match="Entry 99 not found"):
            view_run(_VIEW_RUN_ID, directory=tmp_path, entry=99)

    def test_entry_has_thresholds(self, tmp_path: Path) -> None:
        """Threshold results of the row are carried over as dicts."""
        ws = _create_workspace(tmp_path)
        _write_run(ws, _VIEW_RUN_ID)
        result = view_run(_VIEW_RUN_ID, directory=tmp_path, entry=1)
        row = result.rows[0]
        assert len(row.threshold_results) == 1
        assert row.threshold_results[0]["evaluator"] == "CoherenceEvaluator"
        assert row.threshold_results[0]["passed"] is True

    def test_failed_row(self, tmp_path: Path) -> None:
        """A failing row keeps its failed flag and failed threshold."""
        ws = _create_workspace(tmp_path)
        # With passed=False the fixture marks row 2 and its threshold as failed.
        _write_run(ws, _VIEW_RUN_ID, passed=False)
        result = view_run(_VIEW_RUN_ID, directory=tmp_path, entry=2)
        row = result.rows[0]
        assert row.passed_all is False
        assert row.threshold_results[0]["passed"] is False


# ---------------------------------------------------------------------------
# run view CLI tests
# ---------------------------------------------------------------------------


class TestRunViewCLI:
    """End-to-end tests for the ``run view`` command."""

    def test_table_view(self, tmp_path: Path) -> None:
        """The table header lists the metric names without the suffix."""
        ws = _create_workspace(tmp_path)
        _write_run(ws, _VIEW_RUN_ID)
        result = runner.invoke(
            app, ["run", "view", _VIEW_RUN_ID, "--dir", str(tmp_path)]
        )
        assert result.exit_code == 0
        assert "Coherence" in result.stdout
        assert "Relevance" in result.stdout

    def test_entry_view(self, tmp_path: Path) -> None:
        """``--entry`` prints the row status, scores and thresholds."""
        ws = _create_workspace(tmp_path)
        _write_run(ws, _VIEW_RUN_ID)
        result = runner.invoke(
            app,
            [
                "run",
                "view",
                _VIEW_RUN_ID,
                "--entry",
                "1",
                "--dir",
                str(tmp_path),
            ],
        )
        assert result.exit_code == 0
        assert "Row 1: PASS" in result.stdout
        assert "CoherenceEvaluator" in result.stdout
        assert "Thresholds:" in result.stdout

    def test_entry_not_found(self, tmp_path: Path) -> None:
        """An unknown ``--entry`` exits with code 1 and explains why."""
        ws = _create_workspace(tmp_path)
        _write_run(ws, _VIEW_RUN_ID)
        result = runner.invoke(
            app,
            [
                "run",
                "view",
                _VIEW_RUN_ID,
                "--entry",
                "99",
                "--dir",
                str(tmp_path),
            ],
        )
        assert result.exit_code == 1
        # The error goes to stderr; check both streams like the other
        # not-found CLI test in this module does.
        assert "Entry 99 not found" in (result.stdout + result.stderr)