2 changes: 1 addition & 1 deletion README.md
@@ -186,7 +186,7 @@ Supported baseline modes include:
- `source-lexical`: lexical comparator over linked source content within preferred scope
- `source-global`: source-linked comparator that ignores preferred scope

Reports include per-task retrieved IDs, expected hits, missing IDs, avoid hits, pass/fail state, aggregate summaries, advisories, and failure triage details such as snippets, lifecycle status, scopes, and policy signals. Text reports are meant for maintainers reviewing failed retrieval tasks in a terminal; JSON is the stable machine-readable surface.
Reports include per-task retrieved IDs, expected hits, missing IDs, avoid hits, pass/fail state, aggregate summaries, soft-threshold advisories, and failure triage details such as snippets, lifecycle status, scopes, and policy signals. Every JSON result also includes an `advisory_report` with severity, affected task IDs, and recommended next actions. Text reports render the same advisory report as terminal-friendly guidance for maintainers reviewing failed retrieval tasks; JSON is the stable machine-readable surface.

## Current maturity

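For reference, a minimal sketch of the `advisory_report` object described above, in its default (no-advisory) state. The field set mirrors the dict asserted in `test_cli_eval_retrieval_outputs_json_summary` below; the values shown are the model defaults:

```python
# Default advisory_report payload as it appears in CLI JSON output
# (None serializes to null in JSON).
{
    "severity": "ok",
    "summary": "No retrieval advisory actions.",
    "current_failure_task_ids": [],
    "baseline_weak_spot_task_ids": [],
    "current_regression_task_ids": [],
    "recommended_actions": [],
    "baseline_mode": None,
}
```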
11 changes: 11 additions & 0 deletions src/agent_memory/core/models.py
@@ -345,6 +345,16 @@ class RetrievalEvalAdvisory(BaseModel):
baseline_mode: str | None = None


class RetrievalEvalAdvisoryReport(BaseModel):
severity: Literal["ok", "medium", "high"] = "ok"
summary: str = "No retrieval advisory actions."
current_failure_task_ids: list[str] = Field(default_factory=list)
baseline_weak_spot_task_ids: list[str] = Field(default_factory=list)
current_regression_task_ids: list[str] = Field(default_factory=list)
recommended_actions: list[str] = Field(default_factory=list)
baseline_mode: str | None = None


class RetrievalEvalResultSet(BaseModel):
fixture_paths: list[str] = Field(default_factory=list)
summary: RetrievalEvalSummary = Field(default_factory=RetrievalEvalSummary)
@@ -353,3 +363,4 @@ class RetrievalEvalResultSet(BaseModel):
baseline_summary: RetrievalEvalBaselineSummary | None = None
delta_summary: RetrievalEvalDeltaSummary | None = None
advisories: list[RetrievalEvalAdvisory] = Field(default_factory=list)
advisory_report: RetrievalEvalAdvisoryReport = Field(default_factory=RetrievalEvalAdvisoryReport)
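A quick construction sketch for the new model, using field values taken from the baseline-weak-spot test below; `model_dump()` assumes Pydantic v2 (the v1 equivalent is `.dict()`):

```python
from agent_memory.core.models import RetrievalEvalAdvisoryReport

report = RetrievalEvalAdvisoryReport(
    severity="medium",
    summary="1 baseline weak spot found against lexical-global",
    baseline_weak_spot_task_ids=["branch-only-current-policy"],
    recommended_actions=[
        "Use baseline weak spots as coverage wins: keep the fixture checked in and watch for future regressions.",
    ],
    baseline_mode="lexical-global",
)
assert report.current_failure_task_ids == []  # unset list fields default to empty
payload = report.model_dump()  # assuming Pydantic v2; use report.dict() on v1
```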
73 changes: 73 additions & 0 deletions src/agent_memory/core/retrieval_eval.py
@@ -5,6 +5,7 @@

from agent_memory.core.models import (
RetrievalEvalAdvisory,
RetrievalEvalAdvisoryReport,
RetrievalEvalBaselineSummary,
RetrievalEvalDelta,
RetrievalEvalDeltaSummary,
@@ -649,6 +650,71 @@ def _append_task_detail(lines: list[str], task: RetrievalEvalTaskResult, *, incl
)


def _plural(count: int, singular: str, plural: str | None = None) -> str:
label = singular if count == 1 else (plural or f"{singular}s")
return f"{count} {label}"


def _build_advisory_report(
*,
results: list[RetrievalEvalTaskResult],
baseline_mode: str | None,
) -> RetrievalEvalAdvisoryReport:
current_failure_task_ids = [task.task_id for task in results if not task.pass_]
    missing_task_ids = [
        task.task_id
        for task in results
        if any(task.missing_expected[memory_type] for memory_type in _MEMORY_TYPES)
    ]
    avoid_hit_task_ids = [
        task.task_id
        for task in results
        if any(task.avoid_hits[memory_type] for memory_type in _MEMORY_TYPES)
    ]
baseline_weak_spot_task_ids = [
task.task_id
for task in results
if task.pass_ and task.baseline is not None and not task.baseline.pass_
]
current_regression_task_ids = [
task.task_id
for task in results
if not task.pass_ and task.baseline is not None and task.baseline.pass_
]

summary_parts: list[str] = []
if current_failure_task_ids:
summary_parts.append(f"{_plural(len(current_failure_task_ids), 'current task')} failed")
    if missing_task_ids:
        verb = "has" if len(missing_task_ids) == 1 else "have"
        summary_parts.append(f"{_plural(len(missing_task_ids), 'task')} {verb} missing expected memories")
    if avoid_hit_task_ids:
        verb = "has" if len(avoid_hit_task_ids) == 1 else "have"
        summary_parts.append(f"{_plural(len(avoid_hit_task_ids), 'task')} {verb} avoid-hit memories")
if current_regression_task_ids and baseline_mode is not None:
summary_parts.append(f"{_plural(len(current_regression_task_ids), 'current regression')} against {baseline_mode}")
if baseline_weak_spot_task_ids and not current_failure_task_ids and baseline_mode is not None:
summary_parts.append(f"{_plural(len(baseline_weak_spot_task_ids), 'baseline weak spot')} found against {baseline_mode}")

recommended_actions: list[str] = []
if current_failure_task_ids:
recommended_actions.append("Inspect failed task details and compare retrieved_details against expected_details.")
if missing_task_ids:
recommended_actions.append("Seed or approve missing expected memories, or tighten fixture expectations if they are stale.")
if avoid_hit_task_ids:
recommended_actions.append("Review avoid-hit details for stale, cross-scope, or conflicting approved memories.")
if current_regression_task_ids:
recommended_actions.append("Compare current regressions against the selected baseline before merging retrieval changes.")
if baseline_weak_spot_task_ids and not current_failure_task_ids:
recommended_actions.append("Use baseline weak spots as coverage wins: keep the fixture checked in and watch for future regressions.")

severity = "ok"
if current_failure_task_ids or current_regression_task_ids:
severity = "high"
elif baseline_weak_spot_task_ids:
severity = "medium"

return RetrievalEvalAdvisoryReport(
severity=severity,
summary="; ".join(summary_parts) if summary_parts else "No retrieval advisory actions.",
current_failure_task_ids=current_failure_task_ids,
baseline_weak_spot_task_ids=baseline_weak_spot_task_ids,
current_regression_task_ids=current_regression_task_ids,
recommended_actions=recommended_actions,
baseline_mode=baseline_mode,
)


def render_retrieval_eval_text_report(result_set: RetrievalEvalResultSet) -> str:
summary = result_set.summary
lines = [
@@ -702,6 +768,12 @@ def render_retrieval_eval_text_report(result_set: RetrievalEvalResultSet) -> str
lines.append("advisories:")
lines.extend(f" - {advisory.code}: {advisory.message}" for advisory in result_set.advisories)

advisory_report = result_set.advisory_report
lines.append(f"advisory report: {advisory_report.severity} - {advisory_report.summary}")
if advisory_report.recommended_actions:
lines.append("recommended actions:")
lines.extend(f" - {action}" for action in advisory_report.recommended_actions)

return "\n".join(lines)


@@ -829,6 +901,7 @@ def evaluate_retrieval_fixtures(
baseline_summary=baseline_summary,
delta_summary=delta_summary,
advisories=advisories,
advisory_report=_build_advisory_report(results=results, baseline_mode=baseline_mode),
)
if fail_on_baseline_regression or selected_baseline_regression_memory_types is not None:
if baseline_mode is None:
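A usage sketch for the new report surface, assuming a seeded eval database and fixture file as in the tests below (`db_path` and `fixture_path` stand in for those); the commented output matches the failure-case assertions:

```python
from agent_memory.core.retrieval_eval import (
    evaluate_retrieval_fixtures,
    render_retrieval_eval_text_report,
)

result = evaluate_retrieval_fixtures(
    db_path=db_path, fixtures_path=fixture_path, baseline_mode="lexical"
)
print(result.advisory_report.severity)  # e.g. "high" when a current task fails

# Tail of the text report for a run with one failing task:
print(render_retrieval_eval_text_report(result))
# advisory report: high - 1 current task failed; 1 task has missing expected memories; 1 task has avoid-hit memories
# recommended actions:
#  - Inspect failed task details and compare retrieved_details against expected_details.
#  - Seed or approve missing expected memories, or tighten fixture expectations if they are stale.
#  - Review avoid-hit details for stale, cross-scope, or conflicting approved memories.
```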
63 changes: 63 additions & 0 deletions tests/test_retrieval_evaluation.py
@@ -572,6 +572,15 @@ def test_cli_eval_retrieval_outputs_json_summary(tmp_path: Path) -> None:
}
assert payload["results"][0]["task_id"] == "project-m1-kb-export"
assert payload["results"][0]["pass"] is True
assert payload["advisory_report"] == {
"severity": "ok",
"summary": "No retrieval advisory actions.",
"current_failure_task_ids": [],
"baseline_weak_spot_task_ids": [],
"current_regression_task_ids": [],
"recommended_actions": [],
"baseline_mode": None,
}



@@ -643,6 +652,60 @@ def test_render_retrieval_eval_text_report_shows_baseline_weak_spots(tmp_path: P
assert "current regressions vs baseline: none" in report



def test_evaluate_retrieval_fixtures_builds_advisory_report_for_failures_and_baseline_weak_spots(tmp_path: Path) -> None:
from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures, render_retrieval_eval_text_report

db_path = tmp_path / "retrieval-eval-advisory-report.db"
seeded_ids = _seed_retrieval_eval_db(db_path)
fixture_path = tmp_path / "retrieval-eval-advisory-report.json"
payload = _fixture_payload(seeded_ids)
payload["tasks"] = [payload["tasks"][0]]
payload["tasks"][0]["expected"]["facts"] = [seeded_ids["drift_fact_id"]]
payload["tasks"][0]["avoid"]["facts"] = [seeded_ids["fact_id"]]
fixture_path.write_text(json.dumps(payload, indent=2))

result = evaluate_retrieval_fixtures(db_path=db_path, fixtures_path=fixture_path, baseline_mode="lexical")

assert result.advisory_report.severity == "high"
assert result.advisory_report.summary == "1 current task failed; 1 task has missing expected memories; 1 task has avoid-hit memories"
assert result.advisory_report.current_failure_task_ids == ["project-m1-kb-export"]
assert result.advisory_report.baseline_weak_spot_task_ids == []
assert result.advisory_report.current_regression_task_ids == []
assert result.advisory_report.recommended_actions == [
"Inspect failed task details and compare retrieved_details against expected_details.",
"Seed or approve missing expected memories, or tighten fixture expectations if they are stale.",
"Review avoid-hit details for stale, cross-scope, or conflicting approved memories.",
]
assert result.advisory_report.baseline_mode == "lexical"

report = render_retrieval_eval_text_report(result)
assert "advisory report: high - 1 current task failed; 1 task has missing expected memories; 1 task has avoid-hit memories" in report
assert "recommended actions:" in report
assert " - Inspect failed task details and compare retrieved_details against expected_details." in report



def test_evaluate_retrieval_fixtures_advisory_report_summarizes_baseline_weak_spots(tmp_path: Path) -> None:
from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures

db_path = tmp_path / "retrieval-eval-baseline-advisory-report.db"
_seed_checked_in_fixture_eval_db(db_path)
fixture_path = _checked_in_fixture_dir() / "staleness" / "branch-only-current.json"

result = evaluate_retrieval_fixtures(db_path=db_path, fixtures_path=fixture_path, baseline_mode="lexical-global")

assert result.advisory_report.severity == "medium"
assert result.advisory_report.summary == "1 baseline weak spot found against lexical-global"
assert result.advisory_report.current_failure_task_ids == []
assert result.advisory_report.baseline_weak_spot_task_ids == ["branch-only-current-policy"]
assert result.advisory_report.current_regression_task_ids == []
assert result.advisory_report.recommended_actions == [
"Use baseline weak spots as coverage wins: keep the fixture checked in and watch for future regressions.",
]



def test_evaluate_retrieval_fixtures_emits_triage_detail_contract(tmp_path: Path) -> None:
from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures
