From 97a8f9a1a2e9454d0d016048462e4490a8f6c766 Mon Sep 17 00:00:00 2001 From: cafitac Date: Thu, 30 Apr 2026 16:38:13 +0900 Subject: [PATCH] feat: explain fact review decisions --- .dev/status/current-handoff.md | 167 ++++++++++++++------------------- README.md | 3 + docs/install-smoke.md | 2 +- src/agent_memory/api/cli.py | 89 +++++++++++++++++- tests/test_cli.py | 93 ++++++++++++++++++ 5 files changed, 252 insertions(+), 102 deletions(-) diff --git a/.dev/status/current-handoff.md b/.dev/status/current-handoff.md index b11bd1f..1cf7924 100644 --- a/.dev/status/current-handoff.md +++ b/.dev/status/current-handoff.md @@ -1,18 +1,15 @@ # agent-memory current handoff Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-04-30 15:55 KST +Last updated: 2026-04-30 16:30 KST ## Trigger for the next session If the user starts a fresh session with a vague prompt such as: > 지금 해야하는거 알려줘 - > 다음으로 진행할거 해줘 - > 다음 거 진행해줘 - > agent-memory 이어서 해줘 read this file first. Do not ask the user to restate context. Verify repo state, then answer from the current roadmap position below. @@ -21,7 +18,9 @@ read this file first. Do not ask the user to restate context. Verify repo state, 지금 agent-memory는 OSS 기본 메모리 레이어 신뢰도 작업 Priority 1~3을 대부분 마쳤고, Priority 4 `Conflict, obsolete, and truth lifecycle`를 진행 중이야. -완료된 최신 공개 릴리스는 v0.1.28이야. v0.1.27에서 status transition history가 들어갔고, v0.1.28에서 npm wrapper stdin forwarding과 published smoke의 Hermes hook QA 경로가 보강됐어. 현재 slice는 `feat: add supersedes/replaces relation for facts`야. 목적은 새 fact가 옛 fact를 대체했음을 구조적으로 남겨서 deprecated memory가 왜 폐기됐고 무엇으로 대체됐는지 설명할 수 있게 하는 것. graph/hybrid retrieval 전에 stale memory가 graph를 타고 퍼지지 않게 하는 truth lifecycle 기반이야. +완료된 최신 공개 릴리스는 v0.1.29야. v0.1.27에서 status transition history가 들어갔고, v0.1.28에서 npm wrapper stdin forwarding과 published smoke의 Hermes hook QA 경로가 보강됐고, v0.1.29에서 fact supersession/replacement relation이 들어갔어. + +현재 slice는 `feat: explain conflict review decisions`야. 
목적은 reviewer가 특정 fact가 default retrieval에 보이는지/숨겨지는지, 왜 disputed/deprecated 되었는지, 어떤 같은 claim-slot 대안과 replacement chain이 있는지를 한 번에 설명받도록 `agent-memory review explain fact ...` forensic UX를 추가하는 것. ## Current repo state @@ -35,20 +34,20 @@ Expected GitHub identity: - Use `HOME=/Users/reddit` for gh commands. - Remote: `origin` -> `https://github.com/cafitac/agent-memory.git` -Current verified base before this slice: +Verified base before this slice: - branch: `main` -- HEAD: `1467d24 chore: release v0.1.28 [skip release]` -- tag/release: `v0.1.28` -- GitHub Release: `https://github.com/cafitac/agent-memory/releases/tag/v0.1.28` -- npm: `@cafitac/agent-memory@0.1.28` -- PyPI: `cafitac-agent-memory==0.1.28` -- v0.1.28 published smoke artifact: passed; includes npm/uvx/pipx Hermes hook commands. +- HEAD: `e102865 chore: release v0.1.29 [skip release]` +- tag/release: `v0.1.29` +- GitHub Release: `https://github.com/cafitac/agent-memory/releases/tag/v0.1.29` +- npm: `@cafitac/agent-memory@0.1.29` +- PyPI: `cafitac-agent-memory==0.1.29` +- v0.1.29 published smoke artifact: passed; includes npm/uvx/pipx Hermes hook commands. Active slice/worktree: -- branch: `feat/fact-supersedes-replaces` -- worktree: `/Users/reddit/Project/agent-memory/.worktrees/fact-supersedes-replaces` +- branch: `feat/conflict-decision-explanations` +- worktree: `/Users/reddit/Project/agent-memory/.worktrees/conflict-decision-explanations` Expected local untracked artifacts to preserve in the root checkout: @@ -60,7 +59,7 @@ Expected local untracked artifacts to preserve in the root checkout: Do not delete or commit these unless the user explicitly asks. -## What is complete through v0.1.28 +## What is complete through v0.1.29 ### Distribution and release automation @@ -69,69 +68,52 @@ Do not delete or commit these unless the user explicitly asks. - main merge auto-release is active but protected `main` can block release metadata write-back; if that happens, use release-sync PR + tag push. 
- Publish workflow gates GitHub Release creation on `published-install-smoke` after npm/PyPI publish. - Published smoke uploads `published-install-smoke-result` JSON artifact with success/failure diagnostics. -- v0.1.28 smoke covers npm/npx/npm-exec/uvx/pipx and Hermes hook stdin payload handling. +- v0.1.28+ smoke covers npm/npx/npm-exec/uvx/pipx and Hermes hook stdin payload handling. ### Runtime adapter readiness - Hermes bootstrap/doctor/install flow exists and defaults to the conservative preset. -- This local Hermes setup has agent-memory enabled via `/Users/reddit/.agent-memory/runtime/v0.1.28/.venv/bin/agent-memory` against `/Users/reddit/.agent-memory/memory.db`. +- This local Hermes setup has agent-memory enabled via `/Users/reddit/.agent-memory/runtime/v0.1.29/.venv/bin/agent-memory` against `/Users/reddit/.agent-memory/memory.db`. - Hermes hook fails closed: unavailable DB/schema returns `{}` and exit 0 instead of breaking prompt flow. - Conservative preset remains default: small prompt budgets, one top memory, no alternative-memory detail, no reason-code noise. - `--preset balanced` is explicit opt-in for more context/noise. -### Retrieval eval and quality visibility - -- `agent-memory eval retrieval` exists with JSON/text reporting, baseline comparators, regression gates, failure triage, and structured `advisory_report`. -- On regression gate failure, CLI stderr prints a human-readable advisory report when available. - -### Memory lifecycle and conflict handling - -- Memory statuses: `candidate`, `approved`, `disputed`, `deprecated`. -- Default retrieval remains approved-only. -- `retrieve --status approved|candidate|disputed|deprecated|all` supports intentional forensic retrieval. -- `review conflicts fact ...` shows same-slot fact lifecycle across statuses. -- `review history ...` shows status transition history with reason/actor/evidence/timestamp. 
-- Current slice adds fact replacement chains: old fact `superseded_by` new fact, old fact deprecated, replacement fact approved. - -## Immediate next work: finish fact supersedes/replaces PR +### Truth lifecycle readiness -Goal: +- Normal retrieval is approved-only by default. +- Candidate/disputed/deprecated facts remain available only behind explicit forensic/review surfaces. +- `memory_status_transitions` records status changes with from/to status, reason, actor, evidence IDs, and timestamp. +- `agent-memory review history fact|procedure|episode ...` exposes transition history. +- `agent-memory review supersede fact "$DB" <superseded_fact_id> <replacement_fact_id>` records fact replacement as a relation edge. +- Replacement relation direction: `fact:<superseded_fact_id> --superseded_by--> fact:<replacement_fact_id>`. +- Superseding a fact deprecates the old fact and approves the replacement fact, preserving reason/actor/evidence in transition history. +- `agent-memory review replacements fact ...` exposes replacement chains. -Land the second Priority 4 truth-lifecycle slice: record when one fact supersedes/replaces another, preserve status-transition history, and expose replacement chains through CLI review surfaces while keeping normal retrieval approved-only. +## Current slice: explain conflict review decisions -Active branch/worktree: +Planned behavior: ```bash -cd /Users/reddit/Project/agent-memory/.worktrees/fact-supersedes-replaces +agent-memory review explain fact "$DB" <fact_id> ``` -Implemented in this slice so far: +Expected JSON payload: -1. Tests - - storage/curation test for `supersede_fact(...)` recording `superseded_by` relation, deprecating old fact, approving replacement fact, and preserving transition history. - - CLI test for `agent-memory review supersede fact ...` and `agent-memory review replacements fact ...`. +- `fact`: the selected fact. +- `decision`: current status, whether it is visible in default retrieval, and a short summary. +- `claim_slot`: same subject/predicate/scope alternatives plus status counts. 
+- `history`: transition history with reason/actor/evidence. +- `replacement_chain`: superseded-by and replaces relation edges. +- `default_retrieval_policy`: `approved_only`. -2. Storage/curation/CLI - - `get_fact(...)` - - `list_fact_replacement_relations(...)` - - `supersede_fact(...)` - - `review supersede fact` command - - `review replacements fact` command +This is a small UX layer on top of v0.1.27 transition history and v0.1.29 supersession relation. It should not change retrieval behavior. -3. Docs - - README forensic review examples mention replacement chains. - - `docs/install-smoke.md` forensic review surface mentions `review replacements`. +## Verification checklist for this slice -Focused tests passed: - -```bash -uv run pytest tests/test_review_and_scope_ranking.py::test_supersede_fact_records_replacement_relation_and_status_history tests/test_cli.py::test_python_module_cli_review_supersede_fact_shows_replacement_chain -q -# 2 passed -``` - -Remaining verification: +Run from the active worktree: ```bash +uv run pytest tests/test_cli.py::test_python_module_cli_review_explain_fact_shows_decision_context -q uv run pytest tests/test_review_and_scope_ranking.py tests/test_cli.py -q uv run pytest tests/ -q uv run python scripts/check_release_metadata.py @@ -140,44 +122,33 @@ npm pack --dry-run git diff --check ``` -Then commit, push, open PR, watch CI, merge when green, verify auto-release/publish, and verify npm/PyPI/GitHub Release/published-smoke artifact. If auto-release cannot push protected `main`, use release-sync PR + tag push. - -## Roadmap position - -Final goal: - -agent-memory should be credible as an OSS default memory layer for Hermes, Codex, and Claude Code: safe to install, safe to leave on, measurable, debuggable, conservative by default, and able to explain why memories are trusted or obsolete. - -### Priority 1 — Retrieval quality measurement and triage - -Status: core complete; broader corpus/flake hardening remain. 
- -### Priority 2 — Always-on hook safety and conservative defaults - -Status: mostly complete; real dogfood observations can continue. - -### Priority 3 — Fresh-user onboarding matrix automation - -Status: mostly complete; published smoke gate is active and now includes Hermes hook stdin QA. - -### Priority 4 — Conflict, obsolete, and truth lifecycle - -Status: in progress. - -Completed: - -- v0.1.27: memory transition history. - -Current: - -- fact supersedes/replaces relation. - -Likely next candidates: - -1. conflict review decision explanation UX. -2. retrieval eval determinism/flake hardening. -3. graph-centered foundation only after lifecycle chains are strong enough. - -### Priority 5 — Long-run dogfood and noise monitoring - -Status: not started beyond docs/checklists. +Before commit, scan the diff for secrets/tokens/credentials and preserve local-only untracked files. + +## Recommended next work after this slice + +1. Retrieval eval determinism/flake hardening. + - v0.1.25/v0.1.26 had historical one-off retrieval eval verify failures that passed on rerun. + - Tighten ordering, fixtures, and failure diagnostics before deeper graph traversal. +2. Release workflow protected-main improvement. + - Auto-release direct push still fails under current rules. + - Either codify release-sync PR fallback or adjust permissions/rulesets safely. +3. Graph-centered foundation. + - Entity/Concept canonicalization. + - relation edge traversal. + - graph inspection CLI. + - graph retrieval eval fixtures. + - depth/drift controls. +4. Long-run Hermes dogfood/noise monitoring. + - The v0.1.29 hook is live locally, so collect latency/noise/quality observations before raising prompt-context budgets. + +## Known operational issues + +- Protected `main` blocks auto-release write-back. Established workaround: + 1. create `release-sync/vX.Y.Z` branch from `origin/main`, + 2. run `scripts/bump_release_version.py --patch`, + 3. 
run `uv lock`, tests/readiness/npm dry-run/diff checks, + 4. open/merge `chore: release vX.Y.Z [skip release]` PR, + 5. push annotated tag `vX.Y.Z`, + 6. verify publish workflow, registries, GitHub Release, and smoke artifact. +- PyPI Trusted Publisher is deferred by user preference. +- Do not expose secrets/tokens/API keys. If encountered, redact as `[REDACTED]`. diff --git a/README.md b/README.md index bf617fc..88231ab 100644 --- a/README.md +++ b/README.md @@ -89,8 +89,11 @@ agent-memory review approve fact "$DB" 1 --reason "Verified from current setup g agent-memory review history fact "$DB" 1 agent-memory review supersede fact "$DB" 1 2 --reason "Newer source replaces the old install path." --actor maintainer --evidence-ids-json '[2]' agent-memory review replacements fact "$DB" 1 +agent-memory review explain fact "$DB" 1 ``` +`review explain` combines the current status, default retrieval visibility, transition history, same claim-slot alternatives, and replacement chain into one decision context so a reviewer can see why a stale or conflicting fact is hidden. 
+ ## Hermes quickstart For most Hermes users: diff --git a/docs/install-smoke.md b/docs/install-smoke.md index 7c59aa8..44332d6 100644 --- a/docs/install-smoke.md +++ b/docs/install-smoke.md @@ -99,7 +99,7 @@ Before treating a release as ready for external users, validate these surfaces f | uvx | `uvx --refresh cafitac-agent-memory== agent-memory --help` | PyPI package resolves independently of npm wrapper | | Hermes | `agent-memory bootstrap`; `agent-memory doctor`; `hermes hooks doctor`; one QA prompt with hooks accepted | hook install is merge-safe, bounded by conservative prompt budgets, and fails closed if memory DB is unavailable | | Codex/Claude prompts | `agent-memory codex-prompt ...`; `agent-memory claude-prompt ...` after seeding approved memory | prompt wrappers include actual approved snippets and exclude disputed/deprecated content by default | -| Forensic review | `agent-memory retrieve ... --status all`; `agent-memory review conflicts fact ...`; `agent-memory review history fact ...`; `agent-memory review replacements fact ...` | obsolete/conflicting memory can be inspected intentionally with status-transition reason/evidence history and supersedes/replaces chains without entering normal prompts | +| Forensic review | `agent-memory retrieve ... 
--status all`; `agent-memory review conflicts fact ...`; `agent-memory review history fact ...`; `agent-memory review replacements fact ...`; `agent-memory review explain fact ...` | obsolete/conflicting memory can be inspected intentionally with status-transition reason/evidence history, same-claim alternatives, default-retrieval visibility, and supersedes/replaces chains without entering normal prompts | ## What to capture if smoke fails diff --git a/src/agent_memory/api/cli.py b/src/agent_memory/api/cli.py index f8a0d64..6f8e393 100644 --- a/src/agent_memory/api/cli.py +++ b/src/agent_memory/api/cli.py @@ -41,6 +41,7 @@ render_retrieval_eval_text_report, ) from agent_memory.storage.sqlite import ( + get_fact, initialize_database, list_candidate_episodes, list_candidate_facts, @@ -81,6 +82,45 @@ def parse_fact_ref(value: str) -> int | None: } +def _fact_replacement_chain_payload(relations, *, fact_id: int) -> dict[str, list[dict[str, Any]]]: + chain = {"superseded_by": [], "replaces": []} + for relation in relations: + payload = _fact_replacement_relation_payload(relation) + if payload["superseded_fact_id"] == fact_id: + chain["superseded_by"].append(payload) + elif payload["replacement_fact_id"] == fact_id: + chain["replaces"].append(payload) + return chain + + +def _fact_decision_summary(*, status: str, replacement_chain: dict[str, list[dict[str, Any]]]) -> str: + superseded_by = replacement_chain["superseded_by"] + if status == "approved": + base = "approved: visible in default retrieval" + elif status == "candidate": + base = "candidate: hidden from default retrieval until approved" + elif status == "disputed": + base = "disputed: hidden from default retrieval pending review" + elif status == "deprecated": + base = "deprecated: hidden from default retrieval" + else: + base = f"{status}: hidden from default retrieval" + if superseded_by: + replacement_ids = ", ".join( + f"fact #{item['replacement_fact_id']}" for item in superseded_by if 
item["replacement_fact_id"] is not None + ) + if replacement_ids: + base = f"{base}; superseded by {replacement_ids}" + return base + + +def _status_counts_for_facts(facts) -> dict[str, int]: + counts = {"approved": 0, "candidate": 0, "disputed": 0, "deprecated": 0} + for fact in facts: + counts[fact.status] += 1 + return counts + + def _retrieve_packet_for_prompt(args: argparse.Namespace): return retrieve_memory_packet( db_path=args.db_path, @@ -292,6 +332,14 @@ def _build_parser() -> argparse.ArgumentParser: review_history_parser.add_argument("db_path", type=Path) review_history_parser.add_argument("memory_id", type=int) + review_explain_parser = review_subparsers.add_parser( + "explain", + help="Explain why one memory is or is not visible in default retrieval.", + ) + review_explain_parser.add_argument("memory_type", choices=["fact"]) + review_explain_parser.add_argument("db_path", type=Path) + review_explain_parser.add_argument("memory_id", type=int) + review_conflicts_parser = review_subparsers.add_parser( "conflicts", help="Inspect all fact statuses for one subject/predicate claim slot without changing default retrieval policy.", @@ -626,6 +674,43 @@ def main() -> None: ) ) return + elif args.review_action == "explain": + fact = get_fact(args.db_path, fact_id=args.memory_id) + claim_facts = list_facts_by_claim_slot( + args.db_path, + subject_ref=fact.subject_ref, + predicate=fact.predicate, + scope=fact.scope, + ) + history = list_memory_status_history(args.db_path, memory_type="fact", memory_id=fact.id) + replacement_relations = list_fact_replacement_relations(args.db_path, fact_id=fact.id) + replacement_chain = _fact_replacement_chain_payload(replacement_relations, fact_id=fact.id) + print( + json.dumps( + { + "memory_type": "fact", + "memory_id": fact.id, + "fact": fact.model_dump(mode="json"), + "decision": { + "current_status": fact.status, + "visible_in_default_retrieval": fact.status == "approved", + "summary": 
_fact_decision_summary(status=fact.status, replacement_chain=replacement_chain), + }, + "claim_slot": { + "subject_ref": fact.subject_ref, + "predicate": fact.predicate, + "scope": fact.scope, + "counts": _status_counts_for_facts(claim_facts), + "facts": [item.model_dump(mode="json") for item in claim_facts], + }, + "history": [entry.model_dump(mode="json") for entry in history], + "replacement_chain": replacement_chain, + "default_retrieval_policy": "approved_only", + }, + indent=2, + ) + ) + return elif args.review_action == "conflicts": facts = list_facts_by_claim_slot( args.db_path, @@ -633,9 +718,7 @@ def main() -> None: predicate=args.predicate, scope=args.scope, ) - counts = {"approved": 0, "candidate": 0, "disputed": 0, "deprecated": 0} - for fact in facts: - counts[fact.status] += 1 + counts = _status_counts_for_facts(facts) print( json.dumps( { diff --git a/tests/test_cli.py b/tests/test_cli.py index 75c0712..25cccc2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -231,6 +231,99 @@ def test_python_module_cli_review_conflicts_shows_claim_lifecycle_across_statuse assert payload["default_retrieval_policy"] == "approved_only" +def test_python_module_cli_review_explain_fact_shows_decision_context(tmp_path: Path) -> None: + db_path = tmp_path / "review-explain-fact.db" + initialize_database(db_path) + source = ingest_source_text( + db_path=db_path, + source_type="transcript", + content="Review explain source text.", + metadata={"project": "status-qa"}, + ) + approved = create_candidate_fact( + db_path=db_path, + subject_ref="Status QA", + predicate="target_phrase", + object_ref_or_value="APPROVED_OK", + evidence_ids=[source.id], + scope="project:status-qa", + confidence=0.95, + ) + disputed = create_candidate_fact( + db_path=db_path, + subject_ref="Status QA", + predicate="target_phrase", + object_ref_or_value="DISPUTED_BAD", + evidence_ids=[source.id], + scope="project:status-qa", + confidence=0.91, + ) + replacement = create_candidate_fact( + 
db_path=db_path, + subject_ref="Status QA", + predicate="target_phrase", + object_ref_or_value="REPLACEMENT_OK", + evidence_ids=[source.id], + scope="project:status-qa", + confidence=0.99, + ) + approve_fact(db_path=db_path, fact_id=approved.id) + from agent_memory.core.curation import dispute_memory, supersede_fact + + dispute_memory( + db_path=db_path, + memory_type="fact", + memory_id=disputed.id, + reason="Contradicted by source #1", + actor="reviewer:test", + evidence_ids=[source.id], + ) + supersede_fact( + db_path=db_path, + superseded_fact_id=disputed.id, + replacement_fact_id=replacement.id, + reason="Replacement has newer evidence", + actor="reviewer:test", + evidence_ids=[source.id], + ) + + env = {**os.environ, "PYTHONPATH": "src"} + result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "review", + "explain", + "fact", + str(db_path), + str(disputed.id), + ], + cwd=Path(__file__).resolve().parents[1], + env=env, + capture_output=True, + text=True, + ) + + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert payload["memory_type"] == "fact" + assert payload["fact"]["id"] == disputed.id + assert payload["decision"]["current_status"] == "deprecated" + assert payload["decision"]["visible_in_default_retrieval"] is False + assert payload["decision"]["summary"] == "deprecated: hidden from default retrieval; superseded by fact #3" + assert payload["claim_slot"]["counts"] == {"approved": 2, "candidate": 0, "disputed": 0, "deprecated": 1} + assert [item["object_ref_or_value"] for item in payload["claim_slot"]["facts"]] == [ + "REPLACEMENT_OK", + "APPROVED_OK", + "DISPUTED_BAD", + ] + assert [entry["to_status"] for entry in payload["history"]] == ["disputed", "deprecated"] + assert payload["history"][-1]["reason"] == "Replacement has newer evidence" + assert payload["replacement_chain"]["superseded_by"][0]["replacement_fact_id"] == replacement.id + assert payload["default_retrieval_policy"] == 
"approved_only" + + def test_python_module_cli_review_history_shows_transition_reasons(tmp_path: Path) -> None: db_path = tmp_path / "review-history.db" initialize_database(db_path)