diff --git a/.dev/status/current-handoff.md b/.dev/status/current-handoff.md index 6a1c550..4acf9cf 100644 --- a/.dev/status/current-handoff.md +++ b/.dev/status/current-handoff.md @@ -1,7 +1,7 @@ # agent-memory current handoff Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-05-01 10:21 KST +Last updated: 2026-05-01 11:10 KST ## Trigger for the next session @@ -16,7 +16,7 @@ read this file first. Do not ask the user to restate context. Verify repo state, ## Ready-to-say answer -agent-memory는 v0.1.38까지 배포/Hermes QA가 완료됐고, 현재는 Priority 5 dogfood/noise monitoring의 다음 slice인 read-only observation review candidate report를 진행 중이야. 브랜치는 `feat/observation-review-candidates`, worktree는 `/Users/reddit/Project/agent-memory/.worktrees/observation-review-candidates`야. 목표는 `observations audit`의 top injected refs를 `review explain`, replacement/supersedes chain, `graph inspect` 요약과 copy-paste follow-up commands로 연결하는 거야. 자동 cleanup/mutation은 하지 않고 forensic review만 강화한다. +agent-memory는 v0.1.39까지 배포/Hermes QA가 완료됐고, 현재는 Priority 5 dogfood/noise monitoring에서 v0.1.39 dogfood 결과를 바탕으로 `observations review-candidates`의 JSON 계약을 더 운영 친화적으로 다듬는 slice를 진행 중이야. 브랜치는 `feat/observation-review-temporal`, worktree는 `/Users/reddit/Project/agent-memory/.worktrees/observation-review-temporal`야. 목표는 review-candidates 결과에 top-level count, per-ref observation window, fact status-history summary를 추가해 historical injections와 현재 lifecycle 상태를 더 쉽게 구분하는 것이다. 자동 cleanup/mutation은 여전히 하지 않는다. ## Current repo state @@ -30,19 +30,19 @@ Expected GitHub identity: - Use `HOME=/Users/reddit` for gh commands. - Remote: `origin` -> `https://github.com/cafitac/agent-memory.git` -Verified base before this slice: +Verified before this slice: -- latest completed release: `v0.1.38` -- v0.1.38 removed observation query previews, skipped Hermes doctor/test synthetic observations, added audit quality warnings, and verified a real Hermes turn used an agent-memory fact. 
-- local Hermes hook uses `/Users/reddit/.agent-memory/runtime/v0.1.38/.venv/bin/agent-memory` against `/Users/reddit/.agent-memory/memory.db`. +- latest completed release: `v0.1.39` +- v0.1.39 added read-only `agent-memory observations review-candidates` and completed published smoke/Hermes runtime QA. +- local Hermes hook uses `/Users/reddit/.agent-memory/runtime/v0.1.39/.venv/bin/python -m agent_memory.api.cli hermes-pre-llm-hook ...` against `/Users/reddit/.agent-memory/memory.db`. - root checkout was clean on `main...origin/main` except local-only untracked state. - open PRs were `[]`. Active slice/worktree: -- branch: `feat/observation-review-candidates` -- worktree: `/Users/reddit/Project/agent-memory/.worktrees/observation-review-candidates` -- intended release after merge: likely `v0.1.39` +- branch: `feat/observation-review-temporal` +- worktree: `/Users/reddit/Project/agent-memory/.worktrees/observation-review-temporal` +- intended release after merge: likely `v0.1.40` Expected local untracked artifacts to preserve in the root checkout: @@ -54,62 +54,63 @@ Expected local untracked artifacts to preserve in the root checkout: Do not delete or commit these unless the user explicitly asks. -## Current slice: observation review candidates +## Current slice: observation review temporal summaries Goal: - Keep dogfood/noise monitoring read-only. -- Turn `observations audit` top refs into actionable forensic review candidates. -- Help operators see lifecycle status, replacement chains, and relation graph hints without exposing raw user queries or mutating memory. +- Make `observations review-candidates` easier to consume from local dogfood output. +- Add compact count/window/history summaries without exposing raw user queries and without mutating memory. 
Implemented so far in the active worktree: -- New CLI: - - `agent-memory observations review-candidates --limit N --top N --frequent-threshold N` -- Output contract: - - `kind: retrieval_observation_review_candidates` - - `read_only: true` - - nested `observation_audit` payload - - `candidates[]` derived from `top_memory_refs` - - fact refs include `review_explain` payload equivalent to `review explain fact` - - graph summary includes depth-1 relation neighbor refs and edge count - - signals include existing audit signals plus `has_replacement` and `has_graph_relations` when applicable - - copy-paste commands for `review explain`, `review replacements`, and `graph inspect` -- Refactored CLI review explain to reuse `_fact_review_explanation_payload`. +- `observations audit` top refs now include `observation_window`: + - `first_observation_id` + - `first_observed_at` + - `latest_observation_id` + - `latest_observed_at` +- `observations review-candidates` now includes top-level: + - `observation_count` + - `candidate_count` +- Each review candidate now includes: + - the propagated `observation_window` + - `status_history_summary.transition_count` + - `status_history_summary.latest_transition` - Docs updated: - `README.md` - `docs/hermes-dogfood.md` -- Test added in `tests/test_cli.py`: - - `test_python_module_cli_observations_review_candidates_explains_top_refs_without_mutation_or_raw_queries` +- Tests updated in `tests/test_cli.py`: + - audit regression asserts per-ref observation window. + - review-candidates regression asserts top-level counts and status history summary. Verification so far: - RED confirmed: - - `observations review-candidates` was not a valid subcommand. + - focused tests failed on missing `observation_window` and top-level `observation_count`. 
- GREEN focused: - `uv run pytest tests/test_cli.py::test_python_module_cli_observations_review_candidates_explains_top_refs_without_mutation_or_raw_queries -q` - `1 passed` -- Broader focused: - audit, review-candidates, review explain, graph inspect CLI tests - `4 passed` -- Help smoke: - `PYTHONPATH=src uv run python -m agent_memory.api.cli observations review-candidates --help` - exit 0 + - `TMPDIR=$PWD/.tmp-test uv run pytest tests/test_cli.py::test_python_module_cli_observations_audit_reports_frequent_and_stale_refs_without_raw_queries tests/test_cli.py::test_python_module_cli_observations_review_candidates_explains_top_refs_without_mutation_or_raw_queries -q` + - `2 passed` Remaining before PR: -1. Run full local verification: +1. Run broader/full local verification: + - focused CLI tests around audit/review-candidates - `uv run pytest tests/ -q` - `uv run python scripts/check_release_metadata.py` - `uv run python scripts/smoke_release_readiness.py` - `npm pack --dry-run` - `git diff --check` - `node --check bin/agent-memory.js` -2. Run real temp-DB smoke for `observations review-candidates` and confirm no raw secret-like query text appears. +2. Run real local DB smoke for `observations review-candidates` and verify the new fields exist. 3. Run static diff secret scan. 4. Create PR, watch CI, merge, follow release-sync/publish/published smoke/Hermes QA. -5. After v0.1.39 install, repeat Hermes hook doctor and run installed `observations review-candidates` against the existing local DB. +5. After v0.1.40 install, repeat Hermes hook doctor and run installed `observations review-candidates` against the existing local DB. ## Next natural slice after this one -After the read-only review candidate report is released and dogfooded, continue gathering real observation data. If enough non-synthetic observations accumulate, the next likely work is retrieval quality diagnostics for high empty-retrieval ratios or scope/ranking misses. 
Avoid automatic cleanup/deprecation until the review candidate workflow has been used on real local data. +After the review-candidates contract is released and dogfooded, continue Priority 5 by either: + +1. improving retrieval diagnostics for empty retrieval/high empty ratio, or +2. adding an explicit human review cadence/checklist around candidate reports. + +Avoid automatic cleanup/deprecation until the review candidate workflow has been used on real local data for a while. diff --git a/README.md b/README.md index 7fc7659..0aedead 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ agent-memory observations audit "$DB" --limit 200 --top 10 --frequent-threshold agent-memory observations review-candidates "$DB" --limit 200 --top 10 --frequent-threshold 3 ``` -Use the observation log and audit report to spot frequently injected or surprising memories before changing retrieval behavior. The audit output is read-only JSON with surface/scope counts, empty-retrieval count and ratio, quality warnings such as `low_observation_count` or `high_empty_retrieval_ratio`, top injected memory refs, current status for known refs, and simple signals such as `frequently_injected` and `current_status_not_approved`. `observations review-candidates` is also read-only; it turns the top audit refs into forensic candidates with fact review explanations, replacement-chain hints, graph-neighborhood summaries, and copy-paste follow-up commands such as `review explain`, `review replacements`, and `graph inspect`. Treat these reports as local operator telemetry, not a synced analytics feature or an automatic cleanup workflow. +Use the observation log and audit report to spot frequently injected or surprising memories before changing retrieval behavior. 
The audit output is read-only JSON with surface/scope counts, empty-retrieval count and ratio, quality warnings such as `low_observation_count` or `high_empty_retrieval_ratio`, top injected memory refs, current status for known refs, per-ref observation windows, and simple signals such as `frequently_injected` and `current_status_not_approved`. `observations review-candidates` is also read-only; it turns the top audit refs into forensic candidates with top-level `observation_count`/`candidate_count`, fact review explanations, status-history summaries, replacement-chain hints, graph-neighborhood summaries, and copy-paste follow-up commands such as `review explain`, `review replacements`, and `graph inspect`. Treat these reports as local operator telemetry, not a synced analytics feature or an automatic cleanup workflow. ## Hermes quickstart diff --git a/docs/hermes-dogfood.md b/docs/hermes-dogfood.md index 0ef9525..0a3d948 100644 --- a/docs/hermes-dogfood.md +++ b/docs/hermes-dogfood.md @@ -58,7 +58,8 @@ Use this before tuning ranking or adding broader graph traversal: first confirm - fact refs include the same lifecycle explanation as `agent-memory review explain fact ...`. - replacement/supersedes chains are surfaced as candidate signals instead of mutating anything. - relation graph neighbors are summarized so you know when `agent-memory graph inspect ...` is worth running. -- the JSON includes copy-paste follow-up commands for `review explain`, `review replacements`, and `graph inspect`. +- the JSON includes `observation_count`/`candidate_count`, each ref's observation window, and copy-paste follow-up commands for `review explain`, `review replacements`, and `graph inspect`. +- fact refs include a `status_history_summary` so historical injections that were later deprecated/superseded are easier to distinguish from currently approved frequent memories. Do not treat review candidates as automatic cleanup recommendations. 
They are a short list for human review; approve/deprecate/supersede decisions should still be explicit curation actions. diff --git a/src/agent_memory/api/cli.py b/src/agent_memory/api/cli.py index f517bc6..5b69e90 100644 --- a/src/agent_memory/api/cli.py +++ b/src/agent_memory/api/cli.py @@ -197,6 +197,7 @@ def _audit_retrieval_observations( ) memory_ref_counts: Counter[str] = Counter() sample_observation_ids_by_ref: dict[str, list[int]] = defaultdict(list) + observation_windows_by_ref: dict[str, dict[str, Any]] = {} empty_retrieval_count = 0 for observation in observations: if not observation.retrieved_memory_refs: @@ -206,6 +207,21 @@ def _audit_retrieval_observations( sample_ids = sample_observation_ids_by_ref[memory_ref] if len(sample_ids) < 5: sample_ids.append(observation.id) + window = observation_windows_by_ref.setdefault( + memory_ref, + { + "first_observation_id": observation.id, + "first_observed_at": observation.created_at, + "latest_observation_id": observation.id, + "latest_observed_at": observation.created_at, + }, + ) + if observation.id < window["first_observation_id"]: + window["first_observation_id"] = observation.id + window["first_observed_at"] = observation.created_at + if observation.id > window["latest_observation_id"]: + window["latest_observation_id"] = observation.id + window["latest_observed_at"] = observation.created_at top_memory_refs = [] for memory_ref, injection_count in sorted(memory_ref_counts.items(), key=lambda item: (-item[1], item[0]))[:top]: @@ -222,6 +238,7 @@ def _audit_retrieval_observations( "current_status": current_status, "signals": signals, "sample_observation_ids": sample_observation_ids_by_ref[memory_ref], + "observation_window": observation_windows_by_ref[memory_ref], } ) @@ -282,6 +299,12 @@ def _review_candidates_from_observations( if graph["edges"]: signals.append("has_graph_relations") + history = review_explain["history"] if review_explain is not None else [] + status_history_summary = { + "transition_count": 
len(history), + "latest_transition": history[-1] if history else None, + } + commands = {"graph_inspect": f"agent-memory graph inspect {db_path} {memory_ref} --depth 1"} if parts is not None: memory_type, memory_id = parts @@ -299,6 +322,7 @@ def _review_candidates_from_observations( **top_ref, "signals": signals, "review_explain": review_explain, + "status_history_summary": status_history_summary, "graph_summary": { "start_ref": graph["start_ref"], "depth": graph["depth"], @@ -313,6 +337,8 @@ def _review_candidates_from_observations( return { "kind": "retrieval_observation_review_candidates", "read_only": True, + "observation_count": audit["observation_count"], + "candidate_count": len(candidates), "observation_audit": audit, "candidates": candidates, } diff --git a/tests/test_cli.py b/tests/test_cli.py index f6f2d39..329a8bc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -257,6 +257,9 @@ def test_python_module_cli_observations_audit_reports_frequent_and_stale_refs_wi assert top_ref["current_status"] == "deprecated" assert top_ref["signals"] == ["frequently_injected", "current_status_not_approved"] assert top_ref["sample_observation_ids"] + assert top_ref["observation_window"]["first_observation_id"] <= top_ref["observation_window"]["latest_observation_id"] + assert top_ref["observation_window"]["first_observed_at"] + assert top_ref["observation_window"]["latest_observed_at"] assert "SUPERSECRET" not in audit_result.stdout assert "abc123" not in audit_result.stdout @@ -351,6 +354,8 @@ def test_python_module_cli_observations_review_candidates_explains_top_refs_with payload = json.loads(review_result.stdout) assert payload["kind"] == "retrieval_observation_review_candidates" assert payload["read_only"] is True + assert payload["observation_count"] == 2 + assert payload["candidate_count"] == 1 assert payload["observation_audit"]["kind"] == "retrieval_observation_audit" assert payload["observation_audit"]["read_only"] is True candidate = 
payload["candidates"][0] @@ -363,6 +368,11 @@ def test_python_module_cli_observations_review_candidates_explains_top_refs_with "has_replacement", "has_graph_relations", ] + assert candidate["observation_window"]["first_observation_id"] <= candidate["observation_window"]["latest_observation_id"] + assert candidate["observation_window"]["first_observed_at"] + assert candidate["observation_window"]["latest_observed_at"] + assert candidate["status_history_summary"]["transition_count"] == 2 + assert candidate["status_history_summary"]["latest_transition"]["to_status"] == "deprecated" assert candidate["review_explain"]["decision"]["visible_in_default_retrieval"] is False assert candidate["review_explain"]["replacement_chain"]["superseded_by"][0]["replacement_fact_id"] == replacement_fact.id assert candidate["graph_summary"]["edge_count"] == 1