diff --git a/README.md b/README.md
index 3b10817..f2ba951 100644
--- a/README.md
+++ b/README.md
@@ -186,7 +186,21 @@ Supported baseline modes include:
 - `source-lexical`: lexical comparator over linked source content within preferred scope
 - `source-global`: source-linked comparator that ignores preferred scope
 
-Reports include per-task retrieved IDs, expected hits, missing IDs, avoid hits, pass/fail state, aggregate summaries, advisories, and failure triage details such as snippets, lifecycle status, scopes, and policy signals. Text reports are meant for maintainers reviewing failed retrieval tasks in a terminal; JSON is the stable machine-readable surface.
+Reports include per-task retrieved IDs, expected hits, missing IDs, avoid hits, pass/fail state, aggregate summaries, soft-threshold advisories, and failure triage details such as snippets, lifecycle status, scopes, and policy signals. Every JSON result also includes an `advisory_report` with severity, affected task IDs, and recommended next actions. Text reports render the same advisory report as terminal-friendly guidance for maintainers reviewing failed retrieval tasks; JSON is the stable machine-readable surface.
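+
+For example, a fully passing run with no baseline configured emits the quiet-path advisory report (this excerpt matches the model defaults and the JSON contract asserted in the test suite):
+
+```json
+"advisory_report": {
+  "severity": "ok",
+  "summary": "No retrieval advisory actions.",
+  "current_failure_task_ids": [],
+  "baseline_weak_spot_task_ids": [],
+  "current_regression_task_ids": [],
+  "recommended_actions": [],
+  "baseline_mode": null
+}
+```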
 
 ## Current maturity
 
diff --git a/src/agent_memory/core/models.py b/src/agent_memory/core/models.py
index 40a6db0..ab8f601 100644
--- a/src/agent_memory/core/models.py
+++ b/src/agent_memory/core/models.py
@@ -345,6 +345,17 @@ class RetrievalEvalAdvisory(BaseModel):
     baseline_mode: str | None = None
 
 
+class RetrievalEvalAdvisoryReport(BaseModel):
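+    """Aggregated triage rollup for one retrieval eval run: severity, summary, task buckets, next actions."""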
+    severity: Literal["ok", "medium", "high"] = "ok"
+    summary: str = "No retrieval advisory actions."
+    current_failure_task_ids: list[str] = Field(default_factory=list)
+    baseline_weak_spot_task_ids: list[str] = Field(default_factory=list)
+    current_regression_task_ids: list[str] = Field(default_factory=list)
+    recommended_actions: list[str] = Field(default_factory=list)
+    baseline_mode: str | None = None
+
+
 class RetrievalEvalResultSet(BaseModel):
     fixture_paths: list[str] = Field(default_factory=list)
     summary: RetrievalEvalSummary = Field(default_factory=RetrievalEvalSummary)
@@ -353,3 +364,4 @@
     baseline_summary: RetrievalEvalBaselineSummary | None = None
     delta_summary: RetrievalEvalDeltaSummary | None = None
     advisories: list[RetrievalEvalAdvisory] = Field(default_factory=list)
+    advisory_report: RetrievalEvalAdvisoryReport = Field(default_factory=RetrievalEvalAdvisoryReport)
diff --git a/src/agent_memory/core/retrieval_eval.py b/src/agent_memory/core/retrieval_eval.py
index 6b4c77d..4c23589 100644
--- a/src/agent_memory/core/retrieval_eval.py
+++ b/src/agent_memory/core/retrieval_eval.py
@@ -5,6 +5,7 @@
 from agent_memory.core.models import (
     RetrievalEvalAdvisory,
+    RetrievalEvalAdvisoryReport,
     RetrievalEvalBaselineSummary,
     RetrievalEvalDelta,
     RetrievalEvalDeltaSummary,
@@ -649,6 +650,72 @@ def _append_task_detail(lines: list[str], task: RetrievalEvalTaskResult, *, incl
     )
 
 
+def _plural(count: int, singular: str, plural: str | None = None) -> str:
+    label = singular if count == 1 else (plural or f"{singular}s")
+    return f"{count} {label}"
+
+
+def _build_advisory_report(
+    *,
+    results: list[RetrievalEvalTaskResult],
+    baseline_mode: str | None,
+) -> RetrievalEvalAdvisoryReport:
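+    """Bucket per-task results and derive severity, summary text, and recommended actions."""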
+    current_failure_task_ids = [task.task_id for task in results if not task.pass_]
+    missing_task_ids = [task.task_id for task in results if any(task.missing_expected[memory_type] for memory_type in _MEMORY_TYPES)]
+    avoid_hit_task_ids = [task.task_id for task in results if any(task.avoid_hits[memory_type] for memory_type in _MEMORY_TYPES)]
+    baseline_weak_spot_task_ids = [
+        task.task_id
+        for task in results
+        if task.pass_ and task.baseline is not None and not task.baseline.pass_
+    ]
+    current_regression_task_ids = [
+        task.task_id
+        for task in results
+        if not task.pass_ and task.baseline is not None and task.baseline.pass_
+    ]
+
+    summary_parts: list[str] = []
+    if current_failure_task_ids:
+        summary_parts.append(f"{_plural(len(current_failure_task_ids), 'current task')} failed")
+    if missing_task_ids:
+        summary_parts.append(f"{_plural(len(missing_task_ids), 'task')} with missing expected memories")
+    if avoid_hit_task_ids:
+        summary_parts.append(f"{_plural(len(avoid_hit_task_ids), 'task')} with avoid-hit memories")
+    if current_regression_task_ids and baseline_mode is not None:
+        summary_parts.append(f"{_plural(len(current_regression_task_ids), 'current regression')} against {baseline_mode}")
+    if baseline_weak_spot_task_ids and not current_failure_task_ids and baseline_mode is not None:
+        summary_parts.append(f"{_plural(len(baseline_weak_spot_task_ids), 'baseline weak spot')} found against {baseline_mode}")
+
+    recommended_actions: list[str] = []
+    if current_failure_task_ids:
+        recommended_actions.append("Inspect failed task details and compare retrieved_details against expected_details.")
+    if missing_task_ids:
+        recommended_actions.append("Seed or approve missing expected memories, or tighten fixture expectations if they are stale.")
+    if avoid_hit_task_ids:
+        recommended_actions.append("Review avoid-hit details for stale, cross-scope, or conflicting approved memories.")
+    if current_regression_task_ids:
+        recommended_actions.append("Compare current regressions against the selected baseline before merging retrieval changes.")
+    if baseline_weak_spot_task_ids and not current_failure_task_ids:
+        recommended_actions.append("Use baseline weak spots as coverage wins: keep the fixture checked in and watch for future regressions.")
+
+    severity = "ok"
+    if current_failure_task_ids or current_regression_task_ids:
+        severity = "high"
+    elif baseline_weak_spot_task_ids:
+        severity = "medium"
+
+    return RetrievalEvalAdvisoryReport(
+        severity=severity,
+        summary="; ".join(summary_parts) if summary_parts else "No retrieval advisory actions.",
+        current_failure_task_ids=current_failure_task_ids,
+        baseline_weak_spot_task_ids=baseline_weak_spot_task_ids,
+        current_regression_task_ids=current_regression_task_ids,
+        recommended_actions=recommended_actions,
+        baseline_mode=baseline_mode,
+    )
+
+
 def render_retrieval_eval_text_report(result_set: RetrievalEvalResultSet) -> str:
     summary = result_set.summary
     lines = [
@@ -702,6 +769,12 @@ def render_retrieval_eval_text_report(result_set: RetrievalEvalResultSet) -> str:
         lines.append("advisories:")
         lines.extend(f"  - {advisory.code}: {advisory.message}" for advisory in result_set.advisories)
 
+    advisory_report = result_set.advisory_report
+    lines.append(f"advisory report: {advisory_report.severity} - {advisory_report.summary}")
+    if advisory_report.recommended_actions:
+        lines.append("recommended actions:")
+        lines.extend(f"  - {action}" for action in advisory_report.recommended_actions)
+
     return "\n".join(lines)
 
 
@@ -829,6 +902,7 @@ def evaluate_retrieval_fixtures(
         baseline_summary=baseline_summary,
         delta_summary=delta_summary,
         advisories=advisories,
+        advisory_report=_build_advisory_report(results=results, baseline_mode=baseline_mode),
     )
     if fail_on_baseline_regression or selected_baseline_regression_memory_types is not None:
         if baseline_mode is None:
diff --git a/tests/test_retrieval_evaluation.py b/tests/test_retrieval_evaluation.py
index dd2f4d4..dd275d7 100644
--- a/tests/test_retrieval_evaluation.py
+++ b/tests/test_retrieval_evaluation.py
@@ -572,6 +572,15 @@ def test_cli_eval_retrieval_outputs_json_summary(tmp_path: Path) -> None:
     }
     assert payload["results"][0]["task_id"] == "project-m1-kb-export"
     assert payload["results"][0]["pass"] is True
+    assert payload["advisory_report"] == {
+        "severity": "ok",
+        "summary": "No retrieval advisory actions.",
+        "current_failure_task_ids": [],
+        "baseline_weak_spot_task_ids": [],
+        "current_regression_task_ids": [],
+        "recommended_actions": [],
+        "baseline_mode": None,
+    }
 
 
@@ -643,6 +652,57 @@ def test_render_retrieval_eval_text_report_shows_baseline_weak_spots(tmp_path: Path) -> None:
     assert "current regressions vs baseline: none" in report
 
 
+def test_evaluate_retrieval_fixtures_builds_advisory_report_for_failures_and_baseline_weak_spots(tmp_path: Path) -> None:
+    from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures, render_retrieval_eval_text_report
+
+    db_path = tmp_path / "retrieval-eval-advisory-report.db"
+    seeded_ids = _seed_retrieval_eval_db(db_path)
+    fixture_path = tmp_path / "retrieval-eval-advisory-report.json"
+    payload = _fixture_payload(seeded_ids)
+    payload["tasks"] = [payload["tasks"][0]]
+    payload["tasks"][0]["expected"]["facts"] = [seeded_ids["drift_fact_id"]]
+    payload["tasks"][0]["avoid"]["facts"] = [seeded_ids["fact_id"]]
+    fixture_path.write_text(json.dumps(payload, indent=2))
+
+    result = evaluate_retrieval_fixtures(db_path=db_path, fixtures_path=fixture_path, baseline_mode="lexical")
+
+    assert result.advisory_report.severity == "high"
+    assert result.advisory_report.summary == "1 current task failed; 1 task with missing expected memories; 1 task with avoid-hit memories"
+    assert result.advisory_report.current_failure_task_ids == ["project-m1-kb-export"]
+    assert result.advisory_report.baseline_weak_spot_task_ids == []
+    assert result.advisory_report.current_regression_task_ids == []
+    assert result.advisory_report.recommended_actions == [
+        "Inspect failed task details and compare retrieved_details against expected_details.",
+        "Seed or approve missing expected memories, or tighten fixture expectations if they are stale.",
+        "Review avoid-hit details for stale, cross-scope, or conflicting approved memories.",
+    ]
+    assert result.advisory_report.baseline_mode == "lexical"
+
+    report = render_retrieval_eval_text_report(result)
+    assert "advisory report: high - 1 current task failed; 1 task with missing expected memories; 1 task with avoid-hit memories" in report
+    assert "recommended actions:" in report
+    assert " - Inspect failed task details and compare retrieved_details against expected_details." in report
+
+
+def test_evaluate_retrieval_fixtures_advisory_report_summarizes_baseline_weak_spots(tmp_path: Path) -> None:
+    from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures
+
+    db_path = tmp_path / "retrieval-eval-baseline-advisory-report.db"
+    _seed_checked_in_fixture_eval_db(db_path)
+    fixture_path = _checked_in_fixture_dir() / "staleness" / "branch-only-current.json"
+
+    result = evaluate_retrieval_fixtures(db_path=db_path, fixtures_path=fixture_path, baseline_mode="lexical-global")
+
+    assert result.advisory_report.severity == "medium"
+    assert result.advisory_report.summary == "1 baseline weak spot found against lexical-global"
+    assert result.advisory_report.current_failure_task_ids == []
+    assert result.advisory_report.baseline_weak_spot_task_ids == ["branch-only-current-policy"]
+    assert result.advisory_report.current_regression_task_ids == []
+    assert result.advisory_report.recommended_actions == [
+        "Use baseline weak spots as coverage wins: keep the fixture checked in and watch for future regressions.",
+    ]
+
+
 def test_evaluate_retrieval_fixtures_emits_triage_detail_contract(tmp_path: Path) -> None:
     from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures