From 1755d1035717f6467974c7af3ad043eec5c923b3 Mon Sep 17 00:00:00 2001 From: cafitac Date: Thu, 30 Apr 2026 17:15:10 +0900 Subject: [PATCH] fix: keep retrieval eval read-only --- .dev/status/current-handoff.md | 99 ++++++++++--------------- README.md | 2 + src/agent_memory/core/retrieval.py | 20 ++--- src/agent_memory/core/retrieval_eval.py | 1 + tests/test_retrieval_evaluation.py | 25 +++++++ 5 files changed, 79 insertions(+), 68 deletions(-) diff --git a/.dev/status/current-handoff.md b/.dev/status/current-handoff.md index 1cf7924..bc1df49 100644 --- a/.dev/status/current-handoff.md +++ b/.dev/status/current-handoff.md @@ -1,7 +1,7 @@ # agent-memory current handoff Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-04-30 16:30 KST +Last updated: 2026-04-30 17:12 KST ## Trigger for the next session @@ -16,11 +16,11 @@ read this file first. Do not ask the user to restate context. Verify repo state, ## Ready-to-say answer -지금 agent-memory는 OSS 기본 메모리 레이어 신뢰도 작업 Priority 1~3을 대부분 마쳤고, Priority 4 `Conflict, obsolete, and truth lifecycle`를 진행 중이야. +지금 agent-memory는 OSS 기본 메모리 레이어 신뢰도 작업 Priority 1~4의 주요 truth lifecycle 조각을 v0.1.30까지 완료했고, 다음 안정화 slice로 retrieval-eval determinism hardening을 진행 중이야. -완료된 최신 공개 릴리스는 v0.1.29야. v0.1.27에서 status transition history가 들어갔고, v0.1.28에서 npm wrapper stdin forwarding과 published smoke의 Hermes hook QA 경로가 보강됐고, v0.1.29에서 fact supersession/replacement relation이 들어갔어. +최신 검증 완료 릴리스는 v0.1.30이야. v0.1.27에서 status transition history, v0.1.28에서 npm wrapper stdin forwarding과 published Hermes hook smoke, v0.1.29에서 fact supersession/replacement relation, v0.1.30에서 `agent-memory review explain fact ...` decision explanation UX가 들어갔어. 로컬 Hermes hook도 v0.1.30 runtime으로 업데이트되어 doctor/hook smoke가 통과한 상태야. -현재 slice는 `feat: explain conflict review decisions`야. 목적은 reviewer가 특정 fact가 default retrieval에 보이는지/숨겨지는지, 왜 disputed/deprecated 되었는지, 어떤 같은 claim-slot 대안과 replacement chain이 있는지를 한 번에 설명받도록 `agent-memory review explain fact ...` forensic UX를 추가하는 것. +현재 slice는 retrieval evaluation이 실제 retrieval path를 쓰되 eval 실행 자체가 `retrieval_count`, `reinforcement_count`, `last_accessed_at`를 mutate하지 않게 만드는 determinism hardening이야. 목적은 fixture 순서나 반복 실행이 이후 ranking 결과를 흔들지 않게 하는 것. ## Current repo state @@ -37,17 +37,17 @@ Expected GitHub identity: Verified base before this slice: - branch: `main` -- HEAD: `e102865 chore: release v0.1.29 [skip release]` -- tag/release: `v0.1.29` -- GitHub Release: `https://github.com/cafitac/agent-memory/releases/tag/v0.1.29` -- npm: `@cafitac/agent-memory@0.1.29` -- PyPI: `cafitac-agent-memory==0.1.29` -- v0.1.29 published smoke artifact: passed; includes npm/uvx/pipx Hermes hook commands. +- HEAD: `5011d99 chore: release v0.1.30 [skip release] (#28)` +- tag/release: `v0.1.30` +- GitHub Release: `https://github.com/cafitac/agent-memory/releases/tag/v0.1.30` +- npm: `@cafitac/agent-memory@0.1.30` +- PyPI: `cafitac-agent-memory==0.1.30` +- v0.1.30 published smoke artifact: passed; includes npm/uvx/pipx Hermes hook commands. Active slice/worktree: -- branch: `feat/conflict-decision-explanations` -- worktree: `/Users/reddit/Project/agent-memory/.worktrees/conflict-decision-explanations` +- branch: `fix/retrieval-eval-deterministic-ordering` +- worktree: `/Users/reddit/Project/agent-memory/.worktrees/retrieval-eval-deterministic-ordering` Expected local untracked artifacts to preserve in the root checkout: @@ -59,7 +59,7 @@ Expected local untracked artifacts to preserve in the root checkout: Do not delete or commit these unless the user explicitly asks. -## What is complete through v0.1.29 +## What is complete through v0.1.30 ### Distribution and release automation @@ -73,7 +73,7 @@ Do not delete or commit these unless the user explicitly asks. ### Runtime adapter readiness - Hermes bootstrap/doctor/install flow exists and defaults to the conservative preset. -- This local Hermes setup has agent-memory enabled via `/Users/reddit/.agent-memory/runtime/v0.1.29/.venv/bin/agent-memory` against `/Users/reddit/.agent-memory/memory.db`. +- This local Hermes setup has agent-memory enabled via `/Users/reddit/.agent-memory/runtime/v0.1.30/.venv/bin/agent-memory` against `/Users/reddit/.agent-memory/memory.db`. - Hermes hook fails closed: unavailable DB/schema returns `{}` and exit 0 instead of breaking prompt flow. - Conservative preset remains default: small prompt budgets, one top memory, no alternative-memory detail, no reason-code noise. - `--preset balanced` is explicit opt-in for more context/noise. @@ -88,67 +88,48 @@ Do not delete or commit these unless the user explicitly asks. - Replacement relation direction: `fact: --superseded_by--> fact:`. - Superseding a fact deprecates the old fact and approves the replacement fact, preserving reason/actor/evidence in transition history. - `agent-memory review replacements fact ...` exposes replacement chains. +- `agent-memory review explain fact ...` explains status, default retrieval visibility, same claim-slot alternatives, replacement chain, and review follow-up commands. -## Current slice: explain conflict review decisions +## Current slice: retrieval-eval deterministic ordering hardening Planned behavior: -```bash -agent-memory review explain fact "$DB" -``` +- `evaluate_retrieval_fixtures(...)` continues to call the real `retrieve_memory_packet(...)` path. +- Evaluation runs suppress retrieval bookkeeping writes so repeated evals and fixture order cannot change future ranking via reinforcement state. +- Normal runtime retrieval still records approved memory retrievals by default. -Expected JSON payload: +Implementation direction: -- `fact`: the selected fact. -- `decision`: current status, whether it is visible in default retrieval, and a short summary. -- `claim_slot`: same subject/predicate/scope alternatives plus status counts. -- `history`: transition history with reason/actor/evidence. -- `replacement_chain`: superseded-by and replaces relation edges. -- `default_retrieval_policy`: `approved_only`. - -This is a small UX layer on top of v0.1.27 transition history and v0.1.29 supersession relation. It should not change retrieval behavior. +- Add/keep a default-on `record_retrievals` option on `retrieve_memory_packet(...)`. +- Call `retrieve_memory_packet(..., record_retrievals=False)` from `core/retrieval_eval.py`. +- Test that eval does not mutate `retrieval_count`, `reinforcement_count`, or `last_accessed_at`. ## Verification checklist for this slice Run from the active worktree: ```bash -uv run pytest tests/test_cli.py::test_python_module_cli_review_explain_fact_shows_decision_context -q -uv run pytest tests/test_review_and_scope_ranking.py tests/test_cli.py -q +uv run pytest tests/test_retrieval_evaluation.py::test_evaluate_retrieval_fixtures_does_not_mutate_retrieval_counters -q +uv run pytest tests/test_retrieval_evaluation.py tests/test_retrieval_trace.py tests/test_hermes_adapter.py -q uv run pytest tests/ -q uv run python scripts/check_release_metadata.py uv run python scripts/smoke_release_readiness.py npm pack --dry-run git diff --check +node --check bin/agent-memory.js ``` -Before commit, scan the diff for secrets/tokens/credentials and preserve local-only untracked files. - -## Recommended next work after this slice - -1. Retrieval eval determinism/flake hardening. - - v0.1.25/v0.1.26 had historical one-off retrieval eval verify failures that passed on rerun. - - Tighten ordering, fixtures, and failure diagnostics before deeper graph traversal. -2. Release workflow protected-main improvement. - - Auto-release direct push still fails under current rules. - - Either codify release-sync PR fallback or adjust permissions/rulesets safely. -3. Graph-centered foundation. - - Entity/Concept canonicalization. - - relation edge traversal. - - graph inspection CLI. - - graph retrieval eval fixtures. - - depth/drift controls. -4. Long-run Hermes dogfood/noise monitoring. - - The v0.1.29 hook is live locally, so collect latency/noise/quality observations before raising prompt-context budgets. - -## Known operational issues - -- Protected `main` blocks auto-release write-back. Established workaround: - 1. create `release-sync/vX.Y.Z` branch from `origin/main`, - 2. run `scripts/bump_release_version.py --patch`, - 3. run `uv lock`, tests/readiness/npm dry-run/diff checks, - 4. open/merge `chore: release vX.Y.Z [skip release]` PR, - 5. push annotated tag `vX.Y.Z`, - 6. verify publish workflow, registries, GitHub Release, and smoke artifact. -- PyPI Trusted Publisher is deferred by user preference. -- Do not expose secrets/tokens/API keys. If encountered, redact as `[REDACTED]`. +Before PR, run a static diff secret scan and confirm finding_count 0. + +## PR/release notes + +This slice should be a patch release candidate, likely v0.1.31 after PR merge. If protected `main` blocks auto-release write-back again, use the existing release-sync PR + tag push workaround. + +After release, verify GitHub Release/npm/PyPI/published-install-smoke and update the local Hermes runtime only if the release contains runtime-relevant package changes. This slice changes retrieval/eval behavior in the Python package, so a v0.1.31 runtime update is still preferred for dogfood parity. + +## Next likely slices after this + +1. Release workflow protected-main automation/fallback improvement. +2. Actual Hermes dogfood observations and noise/latency notes. +3. Graph foundation read-only slice: graph inspection CLI or bounded relation traversal eval fixtures. +4. PyPI Trusted Publisher later; user deferred it. diff --git a/README.md b/README.md index 88231ab..3c54bda 100644 --- a/README.md +++ b/README.md @@ -207,6 +207,8 @@ Supported baseline modes include: Reports include per-task retrieved IDs, expected hits, missing IDs, avoid hits, pass/fail state, aggregate summaries, soft-threshold advisories, and failure triage details such as snippets, lifecycle status, scopes, and policy signals. Every JSON result also includes an `advisory_report` with severity, affected task IDs, and recommended next actions. Text reports render the same advisory report as terminal-friendly guidance for maintainers reviewing failed retrieval tasks; JSON is the stable machine-readable surface. +The evaluator calls the real retrieval path but suppresses retrieval bookkeeping side effects while it runs. Eval tasks do not increment `retrieval_count`, `reinforcement_count`, or `last_accessed_at`, so fixture order and repeated local/CI runs do not perturb later ranking results. + ## Current maturity agent-memory is alpha software, but the public install path is validated. diff --git a/src/agent_memory/core/retrieval.py b/src/agent_memory/core/retrieval.py index 6072995..158914a 100644 --- a/src/agent_memory/core/retrieval.py +++ b/src/agent_memory/core/retrieval.py @@ -288,6 +288,7 @@ def retrieve_memory_packet( limit: int = 5, preferred_scope: str | None = None, statuses: tuple[MemoryStatus, ...] = ("approved",), + record_retrievals: bool = True, ) -> MemoryPacket: if statuses == ("approved",): ranked_facts = search_ranked_approved_facts( @@ -420,15 +421,16 @@ def retrieve_memory_packet( ], ) - for fact in semantic_facts: - if fact.status == "approved": - record_memory_retrieval(db_path, memory_type="fact", memory_id=fact.id) - for procedure in procedural_guidance: - if procedure.status == "approved": - record_memory_retrieval(db_path, memory_type="procedure", memory_id=procedure.id) - for episode in episodic_context: - if episode.status == "approved": - record_memory_retrieval(db_path, memory_type="episode", memory_id=episode.id) + if record_retrievals: + for fact in semantic_facts: + if fact.status == "approved": + record_memory_retrieval(db_path, memory_type="fact", memory_id=fact.id) + for procedure in procedural_guidance: + if procedure.status == "approved": + record_memory_retrieval(db_path, memory_type="procedure", memory_id=procedure.id) + for episode in episodic_context: + if episode.status == "approved": + record_memory_retrieval(db_path, memory_type="episode", memory_id=episode.id) return MemoryPacket( query=query, diff --git a/src/agent_memory/core/retrieval_eval.py b/src/agent_memory/core/retrieval_eval.py index d650b63..0041714 100644 --- a/src/agent_memory/core/retrieval_eval.py +++ b/src/agent_memory/core/retrieval_eval.py @@ -461,6 +461,7 @@ def _evaluate_task( query=task.query, limit=task.limit, preferred_scope=task.preferred_scope, + record_retrievals=False, ) primary_metrics = _evaluate_retrieved_ids( mode="current", diff --git a/tests/test_retrieval_evaluation.py b/tests/test_retrieval_evaluation.py index 5e60386..35cfac7 100644 --- a/tests/test_retrieval_evaluation.py +++ b/tests/test_retrieval_evaluation.py @@ -350,6 +350,31 @@ def test_evaluate_retrieval_fixture_directory_recurses_and_sorts_paths(tmp_path: +def test_evaluate_retrieval_fixtures_does_not_mutate_retrieval_counters(tmp_path: Path) -> None: + from agent_memory.core.retrieval_eval import evaluate_retrieval_fixtures + from agent_memory.storage.sqlite import connect + + db_path = tmp_path / "retrieval-eval-readonly.db" + seeded_ids = _seed_retrieval_eval_db(db_path) + fixture_path = _write_fixture_file(tmp_path, seeded_ids) + + evaluate_retrieval_fixtures(db_path=db_path, fixtures_path=fixture_path) + + with connect(db_path) as connection: + fact_row = connection.execute( + "SELECT retrieval_count, reinforcement_count, last_accessed_at FROM facts WHERE id = ?", + (seeded_ids["fact_id"],), + ).fetchone() + procedure_row = connection.execute( + "SELECT retrieval_count, reinforcement_count, last_accessed_at FROM procedures WHERE id = ?", + (seeded_ids["procedure_id"],), + ).fetchone() + + assert dict(fact_row) == {"retrieval_count": 0, "reinforcement_count": 0.0, "last_accessed_at": None} + assert dict(procedure_row) == {"retrieval_count": 0, "reinforcement_count": 0.0, "last_accessed_at": None} + + + def test_checked_in_retrieval_eval_examples_validate_as_fixture_models() -> None: from agent_memory.core.models import RetrievalEvalFixture