From 77d5f321d408b4e124d88f45c96ba77d6125952e Mon Sep 17 00:00:00 2001
From: cafitac
Date: Thu, 30 Apr 2026 07:06:12 +0900
Subject: [PATCH] feat: expand retrieval eval text report details

---
 .dev/status/current-handoff.md          |  2 +-
 README.md                               |  2 +-
 src/agent_memory/core/retrieval_eval.py | 52 +++++++++++++++++++++++--
 tests/test_retrieval_evaluation.py      | 40 +++++++++++++++++++
 4 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/.dev/status/current-handoff.md b/.dev/status/current-handoff.md
index 0d18053..c932c94 100644
--- a/.dev/status/current-handoff.md
+++ b/.dev/status/current-handoff.md
@@ -182,7 +182,7 @@ Current verified behavior:
 - optional `--fail-on-baseline-regression-memory-type {facts,procedures,episodes}` can scope baseline-relative gating down to selected primary task types instead of failing on every current

diff --git a/src/agent_memory/core/retrieval_eval.py b/src/agent_memory/core/retrieval_eval.py
--- a/src/agent_memory/core/retrieval_eval.py
+++ b/src/agent_memory/core/retrieval_eval.py
+def _format_id_map(ids_by_type: dict[str, list[int]]) -> str:
+    parts = [f"{memory_type}={ids_by_type[memory_type]}" for memory_type in _MEMORY_TYPES if ids_by_type.get(memory_type)]
+    return ", ".join(parts) if parts else "none"
+
+
+def _format_pass_label(passed: bool) -> str:
+    return "pass" if passed else "fail"
+
+
+def _append_task_detail(lines: list[str], task: RetrievalEvalTaskResult, *, include_current: bool) -> None:
+    lines.append(f" - {task.task_id}")
+    lines.append(f" query: {task.query}")
+    if include_current:
+        lines.append(f" missing: {_format_id_map(task.missing_expected)}")
+        lines.append(f" avoid: {_format_id_map(task.avoid_hits)}")
+    if task.baseline is not None:
+        lines.append(f" baseline: {_format_pass_label(task.baseline.pass_)}")
+        lines.append(f" baseline missing: {_format_id_map(task.baseline.missing_expected)}")
+        lines.append(f" baseline avoid: {_format_id_map(task.baseline.avoid_hits)}")
+    if task.delta is not None:
+        lines.append(
+            " delta: "
+            f"expected_hits={_signed_delta(task.delta.expected_hit_delta)} "
+            f"missing={_signed_delta(task.delta.missing_expected_delta)} "
+            f"avoid={_signed_delta(task.delta.avoid_hit_delta)} "
+            f"pass_changed={task.delta.pass_changed}"
+        )
+
+
 def render_retrieval_eval_text_report(result_set: RetrievalEvalResultSet) -> str:
     summary = result_set.summary
     lines = [
@@ -531,13 +560,30 @@ def render_retrieval_eval_text_report(result_set: RetrievalEvalResultSet) -> str
         type_summary = summary.by_primary_task_type.get(memory_type, RetrievalEvalMemoryTypeSummary())
         lines.append(_format_type_summary(memory_type, type_summary))
 
-    failed_task_ids = [task.task_id for task in result_set.results if not task.pass_]
-    if failed_task_ids:
+    failed_tasks = [task for task in result_set.results if not task.pass_]
+    if failed_tasks:
         lines.append("failed tasks:")
-        lines.extend(f" - {task_id}" for task_id in failed_task_ids)
+        for task in failed_tasks:
+            _append_task_detail(lines, task, include_current=True)
     else:
         lines.append("failed tasks: none")
 
+    if result_set.baseline_summary is not None:
+        baseline_weak_spots = [task for task in result_set.results if task.pass_ and task.baseline is not None and not task.baseline.pass_]
+        current_regressions = [task for task in result_set.results if not task.pass_ and task.baseline is not None and task.baseline.pass_]
+        if baseline_weak_spots:
+            lines.append("baseline weak spots:")
+            for task in baseline_weak_spots:
+                _append_task_detail(lines, task, include_current=False)
+        else:
+            lines.append("baseline weak spots: none")
+        if current_regressions:
+            lines.append("current regressions vs baseline:")
+            for task in current_regressions:
+                _append_task_detail(lines, task, include_current=True)
+        else:
+            lines.append("current regressions vs baseline: none")
+
     if result_set.advisories:
         lines.append("advisories:")
         lines.extend(f" - {advisory.code}: {advisory.message}" for advisory in result_set.advisories)
diff --git a/tests/test_retrieval_evaluation.py b/tests/test_retrieval_evaluation.py
index 6a3fb7b..e1dedfb 100644
--- a/tests/test_retrieval_evaluation.py
+++ b/tests/test_retrieval_evaluation.py
@@ -596,6 +596,46 @@ def test_render_retrieval_eval_text_report_summarizes_passes_and_type_rollups(tm
     assert "failed tasks: none" in report
 
 
+def test_render_retrieval_eval_text_report_shows_failed_task_details(tmp_path: Path) -> None:
+    from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures, render_retrieval_eval_text_report
+
+    db_path = tmp_path / "retrieval-eval-text-failure.db"
+    seeded_ids = _seed_retrieval_eval_db(db_path)
+    fixture_path = tmp_path / "retrieval-eval-failure.json"
+    payload = _fixture_payload(seeded_ids)
+    payload["tasks"] = [payload["tasks"][0]]
+    payload["tasks"][0]["expected"]["facts"] = [seeded_ids["drift_fact_id"]]
+    payload["tasks"][0]["avoid"]["facts"] = [seeded_ids["fact_id"]]
+    fixture_path.write_text(json.dumps(payload, indent=2))
+
+    result = evaluate_retrieval_fixtures(db_path=db_path, fixtures_path=fixture_path, baseline_mode="lexical")
+    report = render_retrieval_eval_text_report(result)
+
+    assert "failed tasks:" in report
+    assert " - project-m1-kb-export" in report
+    assert " missing: facts=[2]" in report
+    assert " avoid: facts=[1]" in report
+    assert " baseline: fail" in report
+    assert " query: What command does Project M1 use for KB export?" in report
+
+
+def test_render_retrieval_eval_text_report_shows_baseline_weak_spots(tmp_path: Path) -> None:
+    from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures, render_retrieval_eval_text_report
+
+    db_path = tmp_path / "retrieval-eval-text-baseline-weak-spots.db"
+    _seed_checked_in_fixture_eval_db(db_path)
+    fixture_path = _checked_in_fixture_dir() / "staleness" / "branch-only-current.json"
+
+    result = evaluate_retrieval_fixtures(db_path=db_path, fixtures_path=fixture_path, baseline_mode="lexical-global")
+    report = render_retrieval_eval_text_report(result)
+
+    assert "baseline weak spots:" in report
+    assert " - branch-only-current-policy" in report
+    assert " baseline missing: none" in report
+    assert " baseline avoid: facts=[" in report
+    assert "current regressions vs baseline: none" in report
+
+
 def test_cli_eval_retrieval_text_format_outputs_human_summary(tmp_path: Path) -> None:
     db_path = tmp_path / "retrieval-eval-cli-text.db"
     seeded_ids = _seed_retrieval_eval_db(db_path)
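
Illustrative sketch, not part of the patch: assuming the failing-task fixture built in test_render_retrieval_eval_text_report_shows_failed_task_details (expected facts=[2] reported missing, avoid facts=[1] hit, and a lexical baseline that also fails), the expanded report section should render roughly as:

    failed tasks:
     - project-m1-kb-export
     query: What command does Project M1 use for KB export?
     missing: facts=[2]
     avoid: facts=[1]
     baseline: fail

The baseline missing/avoid and delta lines are emitted only when task.baseline and task.delta are populated, and their exact values depend on the baseline mode and on the pre-existing _signed_delta formatting, which this patch does not change.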