From 77d5f321d408b4e124d88f45c96ba77d6125952e Mon Sep 17 00:00:00 2001
From: cafitac
Date: Thu, 30 Apr 2026 07:06:12 +0900
Subject: [PATCH] feat: expand retrieval eval text report details

---
 .dev/status/current-handoff.md          |  2 +-
 README.md                               |  2 +-
 src/agent_memory/core/retrieval_eval.py | 52 +++++++++++++++++++++++--
 tests/test_retrieval_evaluation.py      | 40 +++++++++++++++++++
 4 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/.dev/status/current-handoff.md b/.dev/status/current-handoff.md
index 0d18053..c932c94 100644
--- a/.dev/status/current-handoff.md
+++ b/.dev/status/current-handoff.md
@@ -182,7 +182,7 @@ Current verified behavior:
 - optional `--fail-on-baseline-regression-memory-type {facts,procedures,episodes}` can scope baseline-relative gating down to selected primary task types instead of failing on every current

diff --git a/src/agent_memory/core/retrieval_eval.py b/src/agent_memory/core/retrieval_eval.py
--- a/src/agent_memory/core/retrieval_eval.py
+++ b/src/agent_memory/core/retrieval_eval.py
+def _format_id_map(ids_by_type: dict[str, list[int]]) -> str:
+    parts = [f"{memory_type}={ids_by_type[memory_type]}" for memory_type in _MEMORY_TYPES if ids_by_type.get(memory_type)]
+    return ", ".join(parts) if parts else "none"
+
+
+def _format_pass_label(passed: bool) -> str:
+    return "pass" if passed else "fail"
+
+
+def _append_task_detail(lines: list[str], task: RetrievalEvalTaskResult, *, include_current: bool) -> None:
+    lines.append(f" - {task.task_id}")
+    lines.append(f" query: {task.query}")
+    if include_current:
+        lines.append(f" missing: {_format_id_map(task.missing_expected)}")
+        lines.append(f" avoid: {_format_id_map(task.avoid_hits)}")
+    if task.baseline is not None:
+        lines.append(f" baseline: {_format_pass_label(task.baseline.pass_)}")
+        lines.append(f" baseline missing: {_format_id_map(task.baseline.missing_expected)}")
+        lines.append(f" baseline avoid: {_format_id_map(task.baseline.avoid_hits)}")
+    if task.delta is not None:
+        lines.append(
+            " delta: "
+            f"expected_hits={_signed_delta(task.delta.expected_hit_delta)} "
+            f"missing={_signed_delta(task.delta.missing_expected_delta)} "
+            f"avoid={_signed_delta(task.delta.avoid_hit_delta)} "
+            f"pass_changed={task.delta.pass_changed}"
+        )
+
+
 def render_retrieval_eval_text_report(result_set: RetrievalEvalResultSet) -> str:
     summary = result_set.summary
     lines = [
@@ -531,13 +560,30 @@ def render_retrieval_eval_text_report(result_set: RetrievalEvalResultSet) -> str
         type_summary = summary.by_primary_task_type.get(memory_type, RetrievalEvalMemoryTypeSummary())
         lines.append(_format_type_summary(memory_type, type_summary))
 
-    failed_task_ids = [task.task_id for task in result_set.results if not task.pass_]
-    if failed_task_ids:
+    failed_tasks = [task for task in result_set.results if not task.pass_]
+    if failed_tasks:
         lines.append("failed tasks:")
-        lines.extend(f" - {task_id}" for task_id in failed_task_ids)
+        for task in failed_tasks:
+            _append_task_detail(lines, task, include_current=True)
     else:
         lines.append("failed tasks: none")
 
+    if result_set.baseline_summary is not None:
+        baseline_weak_spots = [task for task in result_set.results if task.pass_ and task.baseline is not None and not task.baseline.pass_]
+        current_regressions = [task for task in result_set.results if not task.pass_ and task.baseline is not None and task.baseline.pass_]
+        if baseline_weak_spots:
+            lines.append("baseline weak spots:")
+            for task in baseline_weak_spots:
+                _append_task_detail(lines, task, include_current=False)
+        else:
+            lines.append("baseline weak spots: none")
+        if current_regressions:
+            lines.append("current regressions vs baseline:")
+            for task in current_regressions:
+                _append_task_detail(lines, task, include_current=True)
+        else:
+            lines.append("current regressions vs baseline: none")
+
     if result_set.advisories:
         lines.append("advisories:")
         lines.extend(f" - {advisory.code}: {advisory.message}" for advisory in result_set.advisories)
diff --git a/tests/test_retrieval_evaluation.py b/tests/test_retrieval_evaluation.py
index 6a3fb7b..e1dedfb 100644
--- a/tests/test_retrieval_evaluation.py
+++ b/tests/test_retrieval_evaluation.py
@@ -596,6 +596,46 @@ def test_render_retrieval_eval_text_report_summarizes_passes_and_type_rollups(tm
     assert "failed tasks: none" in report
 
 
+def test_render_retrieval_eval_text_report_shows_failed_task_details(tmp_path: Path) -> None:
+    from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures, render_retrieval_eval_text_report
+
+    db_path = tmp_path / "retrieval-eval-text-failure.db"
+    seeded_ids = _seed_retrieval_eval_db(db_path)
+    fixture_path = tmp_path / "retrieval-eval-failure.json"
+    payload = _fixture_payload(seeded_ids)
+    payload["tasks"] = [payload["tasks"][0]]
+    payload["tasks"][0]["expected"]["facts"] = [seeded_ids["drift_fact_id"]]
+    payload["tasks"][0]["avoid"]["facts"] = [seeded_ids["fact_id"]]
+    fixture_path.write_text(json.dumps(payload, indent=2))
+
+    result = evaluate_retrieval_fixtures(db_path=db_path, fixtures_path=fixture_path, baseline_mode="lexical")
+    report = render_retrieval_eval_text_report(result)
+
+    assert "failed tasks:" in report
+    assert " - project-m1-kb-export" in report
+    assert " missing: facts=[2]" in report
+    assert " avoid: facts=[1]" in report
+    assert " baseline: fail" in report
+    assert " query: What command does Project M1 use for KB export?" in report
+
+
+def test_render_retrieval_eval_text_report_shows_baseline_weak_spots(tmp_path: Path) -> None:
+    from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures, render_retrieval_eval_text_report
+
+    db_path = tmp_path / "retrieval-eval-text-baseline-weak-spots.db"
+    _seed_checked_in_fixture_eval_db(db_path)
+    fixture_path = _checked_in_fixture_dir() / "staleness" / "branch-only-current.json"
+
+    result = evaluate_retrieval_fixtures(db_path=db_path, fixtures_path=fixture_path, baseline_mode="lexical-global")
+    report = render_retrieval_eval_text_report(result)
+
+    assert "baseline weak spots:" in report
+    assert " - branch-only-current-policy" in report
+    assert " baseline missing: none" in report
+    assert " baseline avoid: facts=[" in report
+    assert "current regressions vs baseline: none" in report
+
+
 def test_cli_eval_retrieval_text_format_outputs_human_summary(tmp_path: Path) -> None:
     db_path = tmp_path / "retrieval-eval-cli-text.db"
     seeded_ids = _seed_retrieval_eval_db(db_path)
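
Illustrative sketch, not part of the patch: assuming the failing-task fixture built in test_render_retrieval_eval_text_report_shows_failed_task_details (expected facts=[2] reported missing, avoid facts=[1] hit, and a lexical baseline that also fails), the expanded report section should render roughly as:

    failed tasks:
     - project-m1-kb-export
     query: What command does Project M1 use for KB export?
     missing: facts=[2]
     avoid: facts=[1]
     baseline: fail

The baseline missing/avoid and delta lines are emitted only when task.baseline and task.delta are populated, and their exact values depend on the baseline mode and on the pre-existing _signed_delta formatting, which this patch does not change.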