Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 69 additions & 7 deletions src/agentops/cli/browse_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@

import typer

from agentops.cli._planned import _planned_command

run_app = typer.Typer(help="Run history and inspection commands.")
bundle_app = typer.Typer(help="Bundle browsing commands.")

Expand Down Expand Up @@ -170,12 +168,76 @@ def cmd_run_show(

def _echo_row_detail(row) -> None:
    """Print every score and threshold outcome recorded for one row view."""
    row_status = "PASS" if row.passed_all else "FAIL"
    typer.echo(f"\nRow {row.row_index}: {row_status}")
    typer.echo("")
    typer.echo("Scores:")
    for name, value in row.scores.items():
        typer.echo(f" {name:<40} {value:.4f}")

    if not row.threshold_results:
        return

    typer.echo("")
    typer.echo("Thresholds:")
    for t in row.threshold_results:
        mark = "PASS" if t["passed"] else "FAIL"
        # Defensive: go through str() so a non-string value (None rejects the
        # "<10" alignment spec) cannot crash the report. Strings and numbers
        # render exactly as before.
        expected = str(t["expected"])
        actual = str(t["actual"])
        typer.echo(
            f" {t['evaluator']:<40} {t['criteria']} {expected:<10} "
            f"actual={actual:<10} {mark}"
        )


def _echo_row_table(result) -> None:
    """Print one line per row with a right-aligned column per metric."""
    if not result.rows:
        typer.echo("\nNo per-row metrics available.")
        return

    # Collect metric names (excluding samples_evaluated)
    metric_names = [n for n in result.evaluator_names if n != "samples_evaluated"]

    # Header: column titles drop the "Evaluator" part of the metric name.
    titles = [name.replace("Evaluator", "") for name in metric_names]
    header = f"{'Row':>4} {'Pass':4}" + "".join(f" {title:>10}" for title in titles)
    typer.echo("")
    typer.echo(header)
    typer.echo("-" * len(header))

    # Rows: a metric missing from a row is shown as an em dash placeholder.
    for row in result.rows:
        row_status = "PASS" if row.passed_all else "FAIL"
        cells = []
        for name in metric_names:
            val = row.scores.get(name)
            cells.append(f" {val:>10.2f}" if val is not None else f" {'—':>10}")
        typer.echo(f"{row.row_index:>4} {row_status:4}" + "".join(cells))


@run_app.command("view")
def cmd_run_view(
    run_id: str = typer.Argument(help="Run ID (timestamp folder name or 'latest')."),
    entry: Annotated[
        int | None,
        typer.Option("--entry", help="Show detail for a specific row index."),
    ] = None,
    directory: Path = typer.Option(
        Path("."),
        "--dir",
        help="Workspace directory.",
    ),
) -> None:
    """View per-row metrics for an evaluation run."""
    # NOTE: the docstring above doubles as the command's --help text, so the
    # longer explanation lives in comments.
    #
    # Prints a one-line run summary, then either the detail of the row selected
    # with --entry or a table covering every row. A FileNotFoundError or
    # ValueError from the service is reported on stderr and ends the command
    # with exit code 1.

    # Imported here rather than at module level, as in the original command.
    from agentops.services.browse import view_run

    try:
        result = view_run(run_id=run_id, directory=directory, entry=entry)
    except (FileNotFoundError, ValueError) as exc:
        typer.echo(f"Error: {exc}", err=True)
        raise typer.Exit(code=1) from exc

    status = "PASS" if result.overall_passed else "FAIL"
    typer.echo(
        f"Run: {result.run_id} ({status}) "
        f"bundle={result.bundle_name} dataset={result.dataset_name}"
    )

    if entry is not None:
        # view_run narrows ``rows`` to exactly the requested entry (or raises),
        # so the first element is the row to show.
        _echo_row_detail(result.rows[0])
    else:
        _echo_row_table(result)
116 changes: 116 additions & 0 deletions src/agentops/services/browse.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,3 +314,119 @@ def show_run(run_id: str, directory: Path = Path(".")) -> RunDetail:
report_path=report_path,
foundry_url=foundry_url,
)


# ---------------------------------------------------------------------------
# Run view (row-level detail)
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class RowView:
    """Per-row metrics and threshold results.

    Immutable record (``frozen=True``) describing a single row of a run as
    assembled by ``view_run``.
    """

    # Index of the row, taken from the run's row metrics / item evaluations.
    row_index: int
    # Overall pass flag of the row's item evaluation; ``view_run`` falls back
    # to True when the row has no item evaluation.
    passed_all: bool
    # Metric name -> metric value for this row (empty when none were recorded).
    scores: Dict[str, float]
    # One dict per threshold check, with the keys "evaluator", "criteria",
    # "expected", "actual" and "passed".
    threshold_results: List[Dict[str, Any]]


@dataclass(frozen=True)
class RunViewResult:
    """Result of viewing a run with row-level detail.

    Immutable record (``frozen=True``) returned by ``view_run``.
    """

    # Run identifier exactly as it was passed to ``view_run``.
    run_id: str
    # Name of the bundle recorded in the run's results.
    bundle_name: str
    # Name of the dataset recorded in the run's results.
    dataset_name: str
    # Overall pass flag from the run summary.
    overall_passed: bool
    # Row views sorted by row index; a single element when an entry was requested.
    rows: List[RowView]
    # Metric names found in the row metrics, in first-seen order.
    evaluator_names: List[str]


def view_run(
    run_id: str,
    directory: Path = Path("."),
    entry: Optional[int] = None,
) -> RunViewResult:
    """Load run results with per-row metric breakdown.

    Args:
        run_id: Name of the run folder below ``<workspace>/results``.
        directory: Directory handed to ``resolve_workspace`` to locate the
            workspace.
        entry: If provided, only that row is included in the result.

    Returns:
        A ``RunViewResult`` whose ``rows`` are sorted by row index and whose
        ``evaluator_names`` lists the per-row metric names in first-seen order.

    Raises:
        FileNotFoundError: If the run folder or its ``results.json`` is missing.
        ValueError: If ``entry`` is given but matches no row of the run.
    """
    workspace = resolve_workspace(directory)
    results_dir = workspace / "results"

    run_dir = (results_dir / run_id).resolve()
    if not run_dir.is_dir():
        # Only list siblings when results/ itself exists; iterating a missing
        # directory would replace the helpful message below with a raw OS error.
        available = (
            [
                d.name
                for d in sorted(results_dir.iterdir(), reverse=True)
                if d.is_dir() and d.name != "latest" and (d / "results.json").exists()
            ]
            if results_dir.is_dir()
            else []
        )
        hint = ", ".join(available[:5]) if available else "(none)"
        raise FileNotFoundError(
            f"Run '{run_id}' not found in {results_dir}. Recent runs: {hint}"
        )

    results_file = run_dir / "results.json"
    if not results_file.exists():
        raise FileNotFoundError(f"No results.json in {run_dir}")

    data = json.loads(results_file.read_text(encoding="utf-8"))
    result = RunResult.model_validate(data)

    # Per-row scores lookup: row index -> {metric name: value}.
    row_scores: Dict[int, Dict[str, float]] = {
        row.row_index: {m.name: m.value for m in row.metrics}
        for row in result.row_metrics
    }
    # dict.fromkeys de-duplicates while keeping first-seen order.
    evaluator_names: List[str] = list(
        dict.fromkeys(m.name for row in result.row_metrics for m in row.metrics)
    )

    # Per-row pass flag and threshold outcomes.
    row_thresholds: Dict[int, List[Dict[str, Any]]] = {}
    row_passed: Dict[int, bool] = {}
    for item in result.item_evaluations:
        row_passed[item.row_index] = item.passed_all
        row_thresholds[item.row_index] = [
            {
                "evaluator": t.evaluator,
                "criteria": t.criteria,
                "expected": t.expected,
                "actual": t.actual,
                "passed": t.passed,
            }
            for t in item.thresholds
        ]

    # A row is known if it has metrics, an item evaluation, or both.
    all_row_indices = sorted(row_scores.keys() | row_passed.keys())

    if entry is not None:
        if entry not in row_scores and entry not in row_passed:
            # "(none)" keeps the message readable when the run has no rows.
            listing = ", ".join(str(i) for i in all_row_indices) or "(none)"
            raise ValueError(f"Entry {entry} not found. Available rows: {listing}")
        all_row_indices = [entry]

    rows: List[RowView] = [
        RowView(
            row_index=idx,
            # NOTE(review): a row with metrics but no item evaluation is
            # reported as passing -- confirm this default is intended.
            passed_all=row_passed.get(idx, True),
            scores=row_scores.get(idx, {}),
            threshold_results=row_thresholds.get(idx, []),
        )
        for idx in all_row_indices
    ]

    return RunViewResult(
        run_id=run_id,
        bundle_name=result.bundle.name,
        dataset_name=result.dataset.name,
        overall_passed=result.summary.overall_passed,
        rows=rows,
        evaluator_names=evaluator_names,
    )
140 changes: 137 additions & 3 deletions tests/unit/test_browse.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
list_runs,
show_bundle,
show_run,
view_run,
)
from agentops.utils.yaml import save_yaml

Expand Down Expand Up @@ -65,10 +66,51 @@ def _write_run(ws: Path, run_id: str, *, passed: bool = True) -> Path:
{"name": "CoherenceEvaluator", "value": 4.5},
{"name": "samples_evaluated", "value": 3.0},
],
"row_metrics": [],
"row_metrics": [
{
"row_index": 1,
"metrics": [
{"name": "CoherenceEvaluator", "value": 5.0},
{"name": "RelevanceEvaluator", "value": 4.0},
],
},
{
"row_index": 2,
"metrics": [
{"name": "CoherenceEvaluator", "value": 4.0},
{"name": "RelevanceEvaluator", "value": 5.0},
],
},
],
"item_evaluations": [
{"row_index": 1, "passed_all": True, "thresholds": []},
{"row_index": 2, "passed_all": passed, "thresholds": []},
{
"row_index": 1,
"passed_all": True,
"thresholds": [
{
"row_index": 1,
"evaluator": "CoherenceEvaluator",
"criteria": ">=",
"expected": "3.000000",
"actual": "5.000000",
"passed": True,
}
],
},
{
"row_index": 2,
"passed_all": passed,
"thresholds": [
{
"row_index": 2,
"evaluator": "CoherenceEvaluator",
"criteria": ">=",
"expected": "3.000000",
"actual": "4.000000",
"passed": passed,
}
],
},
],
"thresholds": [
{
Expand Down Expand Up @@ -257,3 +299,95 @@ def test_not_found(self, tmp_path: Path) -> None:
)
assert result.exit_code == 1
assert "not found" in (result.stdout + result.stderr)


# ---------------------------------------------------------------------------
# view_run service tests
# ---------------------------------------------------------------------------


class TestViewRun:
    """Service-level tests for ``view_run``."""

    RUN_ID = "2026-04-07_100000"

    def _seed(self, tmp_path: Path, **run_kwargs) -> None:
        """Create a workspace under ``tmp_path`` holding one stored run."""
        workspace = _create_workspace(tmp_path)
        _write_run(workspace, self.RUN_ID, **run_kwargs)

    def test_table_view(self, tmp_path: Path) -> None:
        """Without an entry filter both rows come back with their scores."""
        self._seed(tmp_path, passed=True)

        view = view_run(self.RUN_ID, directory=tmp_path)

        assert view.run_id == self.RUN_ID
        assert len(view.rows) == 2
        first_row = view.rows[0]
        second_row = view.rows[1]
        assert first_row.row_index == 1
        assert first_row.scores["CoherenceEvaluator"] == 5.0
        assert second_row.scores["RelevanceEvaluator"] == 5.0

    def test_entry_filter(self, tmp_path: Path) -> None:
        """An entry filter narrows the result to that single row."""
        self._seed(tmp_path)

        view = view_run(self.RUN_ID, directory=tmp_path, entry=2)

        assert len(view.rows) == 1
        assert view.rows[0].row_index == 2

    def test_entry_not_found(self, tmp_path: Path) -> None:
        """An unknown entry index is rejected with a ValueError."""
        self._seed(tmp_path)

        with pytest.raises(ValueError, match="Entry 99 not found"):
            view_run(self.RUN_ID, directory=tmp_path, entry=99)

    def test_entry_has_thresholds(self, tmp_path: Path) -> None:
        """The selected row carries its threshold results."""
        self._seed(tmp_path)

        view = view_run(self.RUN_ID, directory=tmp_path, entry=1)

        thresholds = view.rows[0].threshold_results
        assert len(thresholds) == 1
        only = thresholds[0]
        assert only["evaluator"] == "CoherenceEvaluator"
        assert only["passed"] is True


# ---------------------------------------------------------------------------
# run view CLI tests
# ---------------------------------------------------------------------------


class TestRunViewCLI:
    """End-to-end tests for the ``run view`` CLI command."""

    RUN_ID = "2026-04-07_100000"

    def _invoke_view(self, tmp_path: Path, *extra_args: str):
        """Seed a workspace with one run and invoke ``run view`` on it.

        ``extra_args`` are placed between the run id and ``--dir``, matching
        the argument order the individual tests used before.
        """
        ws = _create_workspace(tmp_path)
        _write_run(ws, self.RUN_ID)
        return runner.invoke(
            app,
            ["run", "view", self.RUN_ID, *extra_args, "--dir", str(tmp_path)],
        )

    def test_table_view(self, tmp_path: Path) -> None:
        """The table view lists the shortened metric names as columns."""
        result = self._invoke_view(tmp_path)
        assert result.exit_code == 0
        assert "Coherence" in result.stdout
        assert "Relevance" in result.stdout

    def test_entry_view(self, tmp_path: Path) -> None:
        """``--entry`` prints the detail view for the requested row."""
        result = self._invoke_view(tmp_path, "--entry", "1")
        assert result.exit_code == 0
        assert "Row 1: PASS" in result.stdout
        assert "CoherenceEvaluator" in result.stdout

    def test_entry_not_found(self, tmp_path: Path) -> None:
        """An unknown entry exits with code 1 and names the missing entry."""
        result = self._invoke_view(tmp_path, "--entry", "99")
        assert result.exit_code == 1
        # Exit code 1 alone would also be produced by unrelated failures (for
        # example a missing run), so pin the message as well.
        assert "Entry 99 not found" in (result.stdout + result.stderr)