diff --git a/.dev/status/current-handoff.md b/.dev/status/current-handoff.md
index cab072f..0d18053 100644
--- a/.dev/status/current-handoff.md
+++ b/.dev/status/current-handoff.md
@@ -1,7 +1,7 @@
 # agent-memory current handoff
 
 Status: AI-authored draft. Not yet human-approved.
-Last updated: 2026-04-29 01:57 KST
+Last updated: 2026-04-30 06:43 KST
 
 ## Trigger for the next session
 
@@ -17,17 +17,17 @@ then read this file first and answer from the "Ready-to-say answer" section belo
 
 ## Ready-to-say answer
 
-The immediate task is to start the retrieval evaluation fixture/harness in `agent-memory`.
+The immediate task is to make the `agent-memory` retrieval evaluation results quick for a human to read and triage.
 
-KB M1/M1+ and the v0.1.8 release/smoke are done; the next step is to build a minimal evaluation loop that measures retrieval quality before adding complexity such as embeddings/reranking.
+Runtime adapters, the retrieval fixture harness, npm/PyPI distribution, and main-merge auto-release are validated. The next step is to polish a report surface that lets a human scan fixture results at a glance and judge which memory type / task type is weak, before adding more retrieval complexity such as embeddings/reranking.
 
 Order of work:
 
 1. Check the state of `~/Project/agent-memory`
-2. Write `.dev/kb/retrieval-eval-m1-implementation-plan.md` based on `.dev/kb/retrieval-evaluation-v0.md`
-3. Start with TDD by adding `tests/test_retrieval_evaluation.py`
-4. Implement `agent-memory eval retrieval` or at least a minimal core API
-5. Verify expected memory IDs / drift / counts against the fixtures
-6. Update the README only briefly, after verification
+2. Review the current structure of the retrieval eval report/CLI
+3. Add tests via TDD for `--format text` or an equivalent human summary surface
+4. Implement a compact text report while keeping the JSON contract unchanged
+5. Update the README and this handoff from the verified results
+6. After PR/CI/merge, confirm that the main auto-release publishes a new patch
 
 Start with this task.
 
@@ -39,23 +39,25 @@ Canonical repo path:
 
 Current branch/release state at this handoff:
 
-- branch: `main`
-- remote: `origin` -> `git@github.com-cafitac:cafitac/agent-memory.git`
-- git status at last check: clean, `main...origin/main`
-- latest commit: `750ef36 chore: release v0.1.8`
-- latest validated release: `v0.1.8`
-- npm: `@cafitac/agent-memory@0.1.8`
-- PyPI: `cafitac-agent-memory==0.1.8`
-- GitHub Release: `https://github.com/cafitac/agent-memory/releases/tag/v0.1.8`
+- branch: `main` before the current report-summary branch
+- remote: `origin` -> `https://github.com/cafitac/agent-memory.git` in the local checkout after gh HTTPS push repair
+- git status at last check: tracked files clean on main; pre-existing untracked local agent/dev state remains intentionally preserved
+- latest commit before this branch: `67653e9 chore: release v0.1.11 [skip release]`
+- latest validated release: `v0.1.11`
+- npm: `@cafitac/agent-memory@0.1.11`
+- PyPI: `cafitac-agent-memory==0.1.11`
+- GitHub Release: `https://github.com/cafitac/agent-memory/releases/tag/v0.1.11`
+- main-merge auto-release is active: `auto-release.yml` bumps patch metadata, commits `[skip release]`, tags, and dispatches `publish.yml` because `GITHUB_TOKEN` tag pushes do not trigger downstream tag workflows reliably
 
 Important run IDs:
 
-- `25065915434` — CI success for `b468166 feat: enrich KB export provenance`
-- `25066123570` — main CI success for `750ef36 chore: release v0.1.8`
-- `25066195998` — publish workflow success for `v0.1.8`
-- `25066196035` — tag CI success for `v0.1.8`
+- `25134278544` — first auto-release main run, created `v0.1.10` but exposed the bot-created tag dispatch gap
+- `25134636398` — fixed auto-release main run for PR #5, successfully bumped/tagged `v0.1.11` and dispatched publish
+- `25134684685` — publish workflow success for `v0.1.11`
+- `25134830075` — manual repair publish workflow success for the earlier `v0.1.10` tag
+- `25133706803` — publish workflow success for `v0.1.9`
 
-Published install smoke for `v0.1.8` was completed after registry propagation:
+Published install smoke for `v0.1.11` was completed after registry propagation:
 
 - npm global install path passed
 - npm wrapper `agent-memory kb export --help` passed
@@ -67,7 +69,7 @@ Published install smoke for `v0.1.8` was completed after registry propagation:
 - uv tool `kb export --help`, `bootstrap`, `doctor` passed
 - final smoke output: `published install smoke ok`
 
-Note: the first npm smoke attempt for `v0.1.8` failed because the npm launcher correctly pinned `cafitac-agent-memory==0.1.8` before uv/PyPI simple-index resolution had caught up. A retry after propagation succeeded. This is the same known registry-propagation behavior seen in v0.1.7 and is not currently a code blocker.
+Note: registry metadata and delegated installer resolvers can briefly disagree after publish. This happened again during `v0.1.10`/`v0.1.11` validation; retrying with propagation time or `uvx --refresh` confirmed the published packages. This is not currently a code blocker.
 
 ## What is complete
 
@@ -78,7 +80,7 @@ Note: the first npm smoke attempt for `v0.1.8` failed because the npm launcher c
 - ...
 - npm is the shortest onboarding surface; PyPI is the canonical Python runtime.
 - npm thin launcher pins the delegated Python package to the npm package version.
 - GitHub Actions CI/publish flow is validated.
-- Actual published install smoke is validated through `v0.1.8`.
+- Actual published install smoke is validated through `v0.1.11`.
 
 ### Hermes integration
 
@@ -180,6 +182,7 @@ Current verified behavior:
 
 - optional `--fail-on-baseline-regression-memory-type {facts,procedures,episodes}` can scope baseline-relative gating down to selected primary task types instead of failing on every
 
 current argparse.ArgumentParser:
 
     eval_retrieval_parser.add_argument("db_path", type=Path)
     eval_retrieval_parser.add_argument("fixtures_path", type=Path)
     eval_retrieval_parser.add_argument("--baseline-mode", choices=["lexical", "lexical-global", "source-lexical", "source-global"])
+    eval_retrieval_parser.add_argument("--format", choices=["json", "text"], default="json")
     eval_retrieval_parser.add_argument("--fail-on-regression", action="store_true")
     eval_retrieval_parser.add_argument("--warn-on-regression-threshold", type=int)
     eval_retrieval_parser.add_argument("--fail-on-baseline-regression", action="store_true")
@@ -535,7 +540,10 @@ def main() -> None:
         except RetrievalEvalRegressionError as exc:
             print(str(exc), file=sys.stderr)
             raise SystemExit(1) from exc
-        print(result.model_dump_json(indent=2, by_alias=True))
+        if args.format == "text":
+            print(render_retrieval_eval_text_report(result))
+        else:
+            print(result.model_dump_json(indent=2, by_alias=True))
         return
 
     raise ValueError(f"Unsupported eval action: {args.eval_action}")
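For the next session's orientation, the `--format text` branch above reuses the same result object as the JSON branch, so the same report can be produced straight from the core API. A minimal sketch (not part of the diff) using only names that appear in it; the two file paths and the `.json` extension are placeholders:

```python
from pathlib import Path

from agent_memory.core.retrieval_eval import (
    evaluate_retrieval_fixtures,
    render_retrieval_eval_text_report,
)

# Placeholder paths; the repo's tests build these with the _seed_retrieval_eval_db
# and _write_fixture_file helpers instead.
db_path = Path("retrieval-eval.db")
fixtures_path = Path("retrieval-fixtures.json")

# Same call the CLI makes for `eval retrieval <db> <fixtures> --baseline-mode lexical`.
result = evaluate_retrieval_fixtures(
    db_path=db_path,
    fixtures_path=fixtures_path,
    baseline_mode="lexical",
)

# `--format text` prints this compact report; the default `--format json` path
# still prints result.model_dump_json(indent=2, by_alias=True).
print(render_retrieval_eval_text_report(result))
```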
diff --git a/src/agent_memory/core/retrieval_eval.py b/src/agent_memory/core/retrieval_eval.py
index 3049e3e..05c7a16 100644
--- a/src/agent_memory/core/retrieval_eval.py
+++ b/src/agent_memory/core/retrieval_eval.py
@@ -486,6 +486,65 @@ def _build_summary(task_metrics: list[tuple[str, RetrievalEvalRunMetrics]]) -> R
     return summary
 
 
+def _signed_delta(value: int) -> str:
+    return f"{value:+d}"
+
+
+def _format_summary_line(prefix: str, summary: RetrievalEvalSummary) -> str:
+    return (
+        f"{prefix}: failures={summary.failed_tasks} "
+        f"missing={summary.total_missing_expected} "
+        f"avoid={summary.total_avoid_hits} "
+        f"expected_hits={summary.total_expected_hits}"
+    )
+
+
+def _format_type_summary(memory_type: str, summary: RetrievalEvalMemoryTypeSummary) -> str:
+    return (
+        f" {memory_type}: {summary.passed_tasks}/{summary.total_tasks} passed, "
+        f"missing={summary.total_missing_expected}, avoid={summary.total_avoid_hits}"
+    )
+
+
+def render_retrieval_eval_text_report(result_set: RetrievalEvalResultSet) -> str:
+    summary = result_set.summary
+    lines = [
+        f"Retrieval evaluation: {summary.passed_tasks}/{summary.total_tasks} tasks passed",
+        _format_summary_line("current", summary),
+    ]
+
+    if result_set.baseline_summary is not None:
+        baseline = result_set.baseline_summary
+        lines.append(f"baseline {baseline.mode}: {baseline.passed_tasks}/{baseline.total_tasks} tasks passed")
+    if result_set.delta_summary is not None:
+        delta = result_set.delta_summary
+        lines.append(
+            "delta: "
+            f"pass_count={_signed_delta(delta.total_pass_count_delta)} "
+            f"expected_hits={_signed_delta(delta.total_expected_hit_delta)} "
+            f"missing={_signed_delta(delta.total_missing_expected_delta)} "
+            f"avoid={_signed_delta(delta.total_avoid_hit_delta)}"
+        )
+
+    lines.append("by primary task type:")
+    for memory_type in _MEMORY_TYPES:
+        type_summary = summary.by_primary_task_type.get(memory_type, RetrievalEvalMemoryTypeSummary())
+        lines.append(_format_type_summary(memory_type, type_summary))
+
+    failed_task_ids = [task.task_id for task in result_set.results if not task.pass_]
+    if failed_task_ids:
+        lines.append("failed tasks:")
+        lines.extend(f" - {task_id}" for task_id in failed_task_ids)
+    else:
+        lines.append("failed tasks: none")
+
+    if result_set.advisories:
+        lines.append("advisories:")
+        lines.extend(f" - {advisory.code}: {advisory.message}" for advisory in result_set.advisories)
+
+    return "\n".join(lines)
+
+
 def evaluate_retrieval_fixtures(
     db_path: Path | str,
     fixtures_path: Path | str,
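With the two-task fixture the new tests seed, the renderer above produces a report shaped roughly like this (only the lines the tests assert are shown; every entry in `_MEMORY_TYPES` gets its own roll-up line, and `advisories:` only appears when advisories exist):

```text
Retrieval evaluation: 2/2 tasks passed
current: failures=0 missing=0 avoid=0 expected_hits=2
baseline lexical: 2/2 tasks passed
delta: pass_count=+0 expected_hits=+0 missing=+0 avoid=+0
by primary task type:
 facts: 1/1 passed, missing=0, avoid=0
 procedures: 1/1 passed, missing=0, avoid=0
failed tasks: none
```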
diff --git a/tests/test_retrieval_evaluation.py b/tests/test_retrieval_evaluation.py
index 2c86f59..6a3fb7b 100644
--- a/tests/test_retrieval_evaluation.py
+++ b/tests/test_retrieval_evaluation.py
@@ -575,6 +575,60 @@ def test_cli_eval_retrieval_outputs_json_summary(tmp_path: Path) -> None:
 
+
+def test_render_retrieval_eval_text_report_summarizes_passes_and_type_rollups(tmp_path: Path) -> None:
+    from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures, render_retrieval_eval_text_report
+
+    db_path = tmp_path / "retrieval-eval-text.db"
+    seeded_ids = _seed_retrieval_eval_db(db_path)
+    fixture_path = _write_fixture_file(tmp_path, seeded_ids)
+
+    result = evaluate_retrieval_fixtures(db_path=db_path, fixtures_path=fixture_path, baseline_mode="lexical")
+    report = render_retrieval_eval_text_report(result)
+
+    assert "Retrieval evaluation: 2/2 tasks passed" in report
+    assert "current: failures=0 missing=0 avoid=0 expected_hits=2" in report
+    assert "baseline lexical: 2/2 tasks passed" in report
+    assert "delta: pass_count=+0 expected_hits=+0 missing=+0 avoid=+0" in report
+    assert "by primary task type:" in report
+    assert "facts: 1/1 passed, missing=0, avoid=0" in report
+    assert "procedures: 1/1 passed, missing=0, avoid=0" in report
+    assert "failed tasks: none" in report
+
+
+def test_cli_eval_retrieval_text_format_outputs_human_summary(tmp_path: Path) -> None:
+    db_path = tmp_path / "retrieval-eval-cli-text.db"
+    seeded_ids = _seed_retrieval_eval_db(db_path)
+    fixture_path = _write_fixture_file(tmp_path, seeded_ids)
+    env = {**os.environ, "PYTHONPATH": "src"}
+
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "agent_memory.api.cli",
+            "eval",
+            "retrieval",
+            str(db_path),
+            str(fixture_path),
+            "--baseline-mode",
+            "lexical",
+            "--format",
+            "text",
+        ],
+        cwd=Path(__file__).resolve().parents[1],
+        env=env,
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0, result.stderr
+    assert result.stdout.startswith("Retrieval evaluation: 2/2 tasks passed")
+    assert "baseline lexical: 2/2 tasks passed" in result.stdout
+    assert "by primary task type:" in result.stdout
+    assert not result.stdout.lstrip().startswith("{")
+
+
 def test_evaluate_retrieval_fixtures_preserves_task_rationale_and_notes(tmp_path: Path) -> None:
     from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures
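To exercise just this surface locally, a minimal sketch, assuming pytest as the runner (the `tmp_path` fixtures imply it) and that `agent_memory` is importable from the repo root, for example via an editable install or with `PYTHONPATH=src` as the CLI test sets:

```python
import sys

import pytest

# "-k text" matches both new tests (the render_report test and the CLI --format
# test); it may also pick up any other test with "text" in its name, so drop
# the filter to run the whole module.
sys.exit(pytest.main(["-q", "tests/test_retrieval_evaluation.py", "-k", "text"]))
```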