From 992391132234bd0d153e6eba12b65fd46b1b962b Mon Sep 17 00:00:00 2001 From: cafitac Date: Fri, 1 May 2026 02:02:21 +0900 Subject: [PATCH] fix: clean observation dogfood telemetry --- .dev/status/current-handoff.md | 110 ++++--- README.md | 4 +- docs/hermes-dogfood.md | 14 +- src/agent_memory/api/cli.py | 11 + src/agent_memory/integrations/hermes_hooks.py | 22 +- src/agent_memory/storage/sqlite.py | 44 +-- tests/test_cli.py | 274 ++++++++++++++++++ 7 files changed, 408 insertions(+), 71 deletions(-) diff --git a/.dev/status/current-handoff.md b/.dev/status/current-handoff.md index 5eaafa6..3b1e72e 100644 --- a/.dev/status/current-handoff.md +++ b/.dev/status/current-handoff.md @@ -1,7 +1,7 @@ # agent-memory current handoff Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-05-01 01:10 KST +Last updated: 2026-05-01 01:55 KST ## Trigger for the next session @@ -16,7 +16,7 @@ read this file first. Do not ask the user to restate context. Verify repo state, ## Ready-to-say answer -agent-memory는 v0.1.36까지 배포/Hermes QA가 완료됐고, 지금은 Priority 5 dogfood/noise monitoring의 다음 slice인 read-only observation audit 작업 중이야. 현재 브랜치는 `feat/observations-audit`, worktree는 `/Users/reddit/Project/agent-memory/.worktrees/observations-audit`이고, 목표는 기존 retrieval observation log를 바탕으로 자주 주입되는 memory ref, surface/scope 분포, 빈 retrieval, deprecated/disputed/missing ref 신호를 raw query 없이 요약하는 `agent-memory observations audit` CLI를 추가하는 거야. +agent-memory는 v0.1.37까지 배포/Hermes QA가 완료됐고, 현재는 실제 dogfood QA에서 발견된 observation 데이터 품질 이슈를 고치는 slice를 진행 중이야. 브랜치는 `fix/observation-dogfood-quality`, worktree는 `/Users/reddit/Project/agent-memory/.worktrees/observation-dogfood-quality`야. 목표는 query preview 제거, `hermes hooks doctor/test` synthetic pre-LLM payload가 dogfood observation을 오염시키지 않게 하기, audit에 데이터 부족/empty retrieval 품질 경고를 추가하기, 그리고 기존 DB에서 `memory_status_transitions` table이 없을 때 approve/review가 lazy migration 되도록 하는 거야. 실제 Hermes가 agent-memory에서 가져온 정보를 답변에 사용하는 E2E도 확인했어. 
## Current repo state @@ -32,15 +32,15 @@ Expected GitHub identity: Verified base before this slice: -- latest completed release: `v0.1.36` -- v0.1.36 included secret-safe local retrieval observation logging and lazy migration for existing DBs without `retrieval_observations`. -- local Hermes hook uses `/Users/reddit/.agent-memory/runtime/v0.1.36/.venv/bin/agent-memory` against `/Users/reddit/.agent-memory/memory.db`. +- latest completed release: `v0.1.37` +- v0.1.37 added read-only `agent-memory observations audit` and was published to GitHub/npm/PyPI. +- local Hermes hook uses `/Users/reddit/.agent-memory/runtime/v0.1.37/.venv/bin/agent-memory` against `/Users/reddit/.agent-memory/memory.db`. Active slice/worktree: -- branch: `feat/observations-audit` -- worktree: `/Users/reddit/Project/agent-memory/.worktrees/observations-audit` -- intended release after merge: likely `v0.1.37` +- branch: `fix/observation-dogfood-quality` +- worktree: `/Users/reddit/Project/agent-memory/.worktrees/observation-dogfood-quality` +- intended release after merge: likely `v0.1.38` Expected local untracked artifacts to preserve in the root checkout: @@ -52,68 +52,86 @@ Expected local untracked artifacts to preserve in the root checkout: Do not delete or commit these unless the user explicitly asks. -## Current slice: read-only retrieval observation audit +## Current slice: observation dogfood data quality Goal: -- Add a local-only, secret-safe, read-only audit report over `retrieval_observations`. -- Summarize dogfood/noise signals before changing ranking, graph traversal, or mutating memory cleanup. +- Keep observation telemetry useful for real dogfood QA. +- Avoid storing prompt-like query previews. +- Avoid synthetic hook doctor/test payloads polluting observation audits. +- Make audit explicitly report low-signal data states. +- Ensure existing DBs lazily migrate missing lifecycle tables encountered during real local QA. 
Implemented so far in the active worktree: -- New CLI: - - `agent-memory observations audit --limit 200 --top 10 --frequent-threshold 3` -- JSON output includes: - - `kind: retrieval_observation_audit` - - `read_only: true` - - `observation_count` - - `surface_counts` - - `preferred_scope_counts` - - `empty_retrieval_count` - - `top_memory_refs[]` with `memory_ref`, `injection_count`, `current_status`, `signals`, and sample observation ids -- Current signals: - - `frequently_injected` - - `current_status_not_approved` -- Storage helper added: - - `get_memory_status(db_path, memory_type=..., memory_id=...)` +- `record_retrieval_observation` now writes `query_preview = None` for new observations. +- Hermes pre-LLM hook detects the deterministic `hermes hooks doctor/test` payload: + - session_id `test-session` + - user_message `What is the weather?` + - empty conversation_history + - is_first_turn true + - model `gpt-4` + - platform `cli` +- Synthetic doctor/test payloads still exercise hook context injection but do not write dogfood observation rows. +- `observations audit` now returns: + - `empty_retrieval_ratio` + - `quality_warnings` + - `no_observations` + - `low_observation_count` + - `high_empty_retrieval_ratio` +- `memory_status_transitions` now has lazy/idempotent schema ensure used by initialize, status update, and status history paths. - Docs updated: - `README.md` - `docs/hermes-dogfood.md` - -Secret-safety contract: - -- audit uses existing observation rows and does not read or emit raw query text. -- output contains counts, memory refs, statuses, and observation ids only. -- keep this data local unless intentionally exported. 
+- Tests added/updated in `tests/test_cli.py`: + - query preview is absent from observation list output + - audit reports low-signal empty retrievals + - approve-fact migrates existing DBs missing `memory_status_transitions` + - Hermes hook synthetic doctor payload skips observation write + - Hermes hook context includes retrieved memory content when line budget allows Verification so far: -- RED confirmed before implementation: - - `agent-memory observations audit` failed with argparse invalid choice. +- RED confirmed: + - query_preview still present + - synthetic doctor payload wrote observation rows + - audit lacked `empty_retrieval_ratio`/`quality_warnings` + - existing DB without `memory_status_transitions` failed approve with sqlite OperationalError - GREEN focused: - - `uv run pytest tests/test_cli.py::test_python_module_cli_observations_audit_reports_frequent_and_stale_refs_without_raw_queries -q` - - `1 passed` -- Focused regression group: - - `uv run pytest tests/test_cli.py::test_python_module_cli_observations_audit_reports_frequent_and_stale_refs_without_raw_queries tests/test_cli.py::test_python_module_cli_retrieve_observe_records_secret_safe_local_observation tests/test_cli.py::test_python_module_cli_observations_list_migrates_existing_database_without_observation_table -q` - - `3 passed` -- CLI help smoke: - - `uv run python -m agent_memory.api.cli observations audit --help` - - `uv run python -m agent_memory.api.cli observations list --help` - - both exit 0. 
+ - `uv run pytest tests/test_cli.py::test_python_module_cli_approve_fact_migrates_existing_database_without_status_transition_table tests/test_cli.py::test_python_module_cli_retrieve_observe_records_secret_safe_local_observation tests/test_cli.py::test_python_module_cli_observations_audit_reports_low_signal_empty_retrievals tests/test_cli.py::test_python_module_cli_hermes_pre_llm_hook_skips_synthetic_doctor_observation tests/test_cli.py::test_python_module_cli_hermes_pre_llm_hook_injects_retrieved_memory_context -q` + - `5 passed` + +Live local Hermes QA already confirmed on v0.1.37 runtime before this patch: + +- Created a temporary approved fact in `/Users/reddit/.agent-memory/memory.db` with marker `AM_LIVE_E2E_1777567838` scoped to `/Users/reddit/Project/agent-memory`. +- Direct hook check confirmed: + - `direct_hook_contains_marker=True` + - `direct_hook_contains_agent_memory_context=True` + - `direct_hook_contains_retrieved_fact=True` +- Actual Hermes command confirmed the model used injected memory: + - `hermes --accept-hooks -z "What is the Hermes live E2E QA marker? Return only the marker and nothing else."` + - output contained `AM_LIVE_E2E_1777567838` +- Cleanup done: + - test fact id 2 deprecated with reason `live E2E QA cleanup` + - `review explain` showed `visible_in_default_retrieval: false` +- During live QA, an existing DB migration gap was discovered: + - approve failed until `agent-memory init ~/.agent-memory/memory.db` created `memory_status_transitions` + - this is now covered by the new lazy migration test/fix. Remaining before PR: -1. Run full local verification: +1. Run broader focused group and full local verification: - `uv run pytest tests/ -q` - `uv run python scripts/check_release_metadata.py` - `uv run python scripts/smoke_release_readiness.py` - `npm pack --dry-run` - `git diff --check` - `node --check bin/agent-memory.js` -2. Run real smoke for `observations audit` on a temp DB and confirm no raw secret-like query text appears. +2. 
Run real smoke for observation list/audit on a temp DB and confirm query_preview is null and no raw secret-like text appears. 3. Run static diff secret scan. 4. Create PR, watch CI, merge, follow release-sync/publish/published smoke/Hermes QA. +5. After v0.1.38 install, repeat Hermes hook doctor and one real E2E check with the new runtime. ## Next natural slice after this one -After this audit slice is released and Hermes QA passes, the next likely Priority 5 step is dogfood cadence refinement: use the audit report over real Hermes observations to decide whether ranking/scope filters need adjustment. Avoid mutating cleanup or broad graph retrieval until the read-only signals have been observed in real use. +After this data-quality fix is released and Hermes QA passes, continue dogfood/noise monitoring using the cleaner audit data. Avoid mutating cleanup or broader graph retrieval until there are enough real, non-synthetic observations to justify ranking/scope changes. diff --git a/README.md b/README.md index 8060f24..212bb5c 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ agent-memory graph inspect "$DB" fact:1 --depth 2 --limit 50 The JSON output includes the start ref, visited node refs, relation edges, traversal depth per edge, and a `read_only: true` marker. It is intended as a safe graph-foundation slice before enabling any broader graph traversal in default retrieval. -For local dogfood and noise monitoring, retrievals can leave a secret-safe observation log. Normal `retrieve` only records an observation when explicitly asked; the Hermes pre-LLM hook records one automatically in the local SQLite DB. Observations store a query hash, a redacted short preview, selected memory refs, top memory ref, response mode, scope, and surface. They do not store the raw query text. +For local dogfood and noise monitoring, retrievals can leave a secret-safe observation log. 
Normal `retrieve` only records an observation when explicitly asked; the Hermes pre-LLM hook records one automatically in the local SQLite DB for real turns. Observations store a query hash, selected memory refs, top memory ref, response mode, scope, and surface. They do not store the raw query text or a query preview. Deterministic `hermes hooks doctor/test` pre-LLM payloads exercise context injection but are skipped as dogfood observations so synthetic weather prompts do not pollute the audit. ```bash agent-memory retrieve "$DB" "How should I install agent-memory?" --preferred-scope user:default --observe cli @@ -111,7 +111,7 @@ agent-memory observations list "$DB" --limit 20 agent-memory observations audit "$DB" --limit 200 --top 10 --frequent-threshold 3 ``` -Use the observation log and audit report to spot frequently injected or surprising memories before changing retrieval behavior. The audit output is read-only JSON with surface/scope counts, empty-retrieval count, top injected memory refs, current status for known refs, and simple signals such as `frequently_injected` and `current_status_not_approved`. Treat it as local operator telemetry, not a synced analytics stream. +Use the observation log and audit report to spot frequently injected or surprising memories before changing retrieval behavior. The audit output is read-only JSON with surface/scope counts, empty-retrieval count and ratio, quality warnings such as `low_observation_count` or `high_empty_retrieval_ratio`, top injected memory refs, current status for known refs, and simple signals such as `frequently_injected` and `current_status_not_approved`. Treat it as local operator telemetry, not a synced analytics stream. 
## Hermes quickstart diff --git a/docs/hermes-dogfood.md b/docs/hermes-dogfood.md index 45e7d91..c1ed53c 100644 --- a/docs/hermes-dogfood.md +++ b/docs/hermes-dogfood.md @@ -36,21 +36,27 @@ Capture these observations for each dogfood run: - whether returned context includes only approved memory - whether unrelated scopes stay out of the prompt - whether failure paths fail closed with no broken prompt text -- whether `agent-memory observations list ~/.agent-memory/memory.db --limit 20` shows the expected memory refs without raw query text or secrets -- whether `agent-memory observations audit ~/.agent-memory/memory.db --limit 200 --top 10` highlights frequently injected or no-longer-approved refs before any retrieval tuning +- whether `agent-memory observations list ~/.agent-memory/memory.db --limit 20` shows the expected memory refs without raw query text, query previews, or secrets +- whether `agent-memory observations audit ~/.agent-memory/memory.db --limit 200 --top 10` highlights frequently injected or no-longer-approved refs, low observation counts, and high empty-retrieval ratios before any retrieval tuning A good conservative smoke has low latency, at most one surfaced memory, no noisy reason codes, no workflow-blocking error if the memory DB is missing, and a local observation entry that explains what memory was injected. ## Local observation log -Hermes pre-LLM hook retrievals write a secret-safe local observation row to the SQLite DB. The row is intended for dogfood/noise review and stores the surface, query hash, redacted query preview, selected memory refs, top memory ref, response mode, scope, and small metadata. It does not store the raw query text. +Hermes pre-LLM hook retrievals write a secret-safe local observation row to the SQLite DB for real turns. The row is intended for dogfood/noise review and stores the surface, query hash, selected memory refs, top memory ref, response mode, scope, and small metadata. 
It does not store the raw query text or a query preview. `hermes hooks doctor` / `hermes hooks test pre_llm_call` still exercise hook context injection, but their deterministic synthetic weather payload is skipped as dogfood observation data. ```bash agent-memory observations list ~/.agent-memory/memory.db --limit 20 agent-memory observations audit ~/.agent-memory/memory.db --limit 200 --top 10 --frequent-threshold 3 ``` -Use this before tuning ranking or adding broader graph traversal: first confirm which memories are frequently injected, which scopes are active, whether retrieval is often empty, and whether any frequently injected refs are now deprecated/disputed/missing. The audit command is read-only and summarizes local observation rows without emitting raw query text. Keep this data local unless you intentionally export it. +Use this before tuning ranking or adding broader graph traversal: first confirm which memories are frequently injected, which scopes are active, whether retrieval is often empty, and whether any frequently injected refs are now deprecated/disputed/missing. The audit command is read-only and summarizes local observation rows without emitting raw query text or query previews. Keep this data local unless you intentionally export it. + +When the audit reports `quality_warnings`, treat them as QA signals rather than cleanup instructions: + +- `no_observations`: Hermes has not produced dogfood observation data yet; check hook install/allowlist and run a real prompt. +- `low_observation_count`: keep dogfooding before drawing ranking conclusions. +- `high_empty_retrieval_ratio`: memory retrieval is often returning no approved refs; check scopes, approved memory coverage, and query wording before changing rankers. 
## Fallback and rollback diff --git a/src/agent_memory/api/cli.py b/src/agent_memory/api/cli.py index f1a72d1..89701c4 100644 --- a/src/agent_memory/api/cli.py +++ b/src/agent_memory/api/cli.py @@ -184,6 +184,15 @@ def _audit_retrieval_observations( } ) + empty_retrieval_ratio = empty_retrieval_count / len(observations) if observations else 0.0 + quality_warnings = [] + if not observations: + quality_warnings.append("no_observations") + if 0 < len(observations) < 10: + quality_warnings.append("low_observation_count") + if empty_retrieval_ratio >= 0.5 and observations: + quality_warnings.append("high_empty_retrieval_ratio") + return { "kind": "retrieval_observation_audit", "read_only": True, @@ -194,6 +203,8 @@ def _audit_retrieval_observations( "surface_counts": dict(sorted(surface_counts.items())), "preferred_scope_counts": dict(sorted(preferred_scope_counts.items())), "empty_retrieval_count": empty_retrieval_count, + "empty_retrieval_ratio": round(empty_retrieval_ratio, 4), + "quality_warnings": quality_warnings, "top_memory_refs": top_memory_refs, } diff --git a/src/agent_memory/integrations/hermes_hooks.py b/src/agent_memory/integrations/hermes_hooks.py index 4926efb..0403b70 100644 --- a/src/agent_memory/integrations/hermes_hooks.py +++ b/src/agent_memory/integrations/hermes_hooks.py @@ -35,6 +35,25 @@ def scope_from_cwd(cwd: str | Path | None) -> str | None: def resolve_effective_preferred_scope(payload: "HermesShellHookPayload", options: "HermesPreLlmHookOptions") -> str | None: return options.preferred_scope or scope_from_cwd(payload.cwd) + +def is_synthetic_hermes_doctor_payload(payload: "HermesShellHookPayload") -> bool: + """Detect the deterministic pre-LLM payload used by `hermes hooks doctor/test`. + + Doctor/test payloads should still exercise hook context injection, but they + should not be counted as dogfood retrieval observations because they look like + real user turns and otherwise pollute noisy-memory audits. 
+ """ + return ( + payload.hook_event_name == "pre_llm_call" + and payload.session_id == "test-session" + and payload.extra.get("user_message") == "What is the weather?" + and payload.extra.get("conversation_history") == [] + and payload.extra.get("is_first_turn") is True + and payload.extra.get("model") == "gpt-4" + and payload.extra.get("platform") == "cli" + ) + + class HermesShellHookPayload(BaseModel): hook_event_name: str tool_name: str | None = None @@ -369,13 +388,14 @@ def build_pre_llm_hook_context( return {} effective_preferred_scope = resolve_effective_preferred_scope(payload, options) + observation_surface = None if is_synthetic_hermes_doctor_payload(payload) else "hermes-pre-llm-hook" try: packet = retrieve_memory_packet( db_path=options.db_path, query=user_message, limit=options.limit, preferred_scope=effective_preferred_scope, - observation_surface="hermes-pre-llm-hook", + observation_surface=observation_surface, observation_metadata={"hook_event_name": payload.hook_event_name}, ) context = prepare_hermes_memory_context( diff --git a/src/agent_memory/storage/sqlite.py b/src/agent_memory/storage/sqlite.py index 8a0b739..5ad1df2 100644 --- a/src/agent_memory/storage/sqlite.py +++ b/src/agent_memory/storage/sqlite.py @@ -3,7 +3,6 @@ import hashlib import json import math -import re import sqlite3 from datetime import datetime from importlib.resources import files @@ -111,9 +110,31 @@ def initialize_database(db_path: Path | str) -> None: connection.execute( "CREATE INDEX IF NOT EXISTS idx_episodes_status_scope_importance ON episodes(status, scope, importance_score)" ) + _ensure_memory_status_transitions_schema(connection) _ensure_retrieval_observations_schema(connection) +def _ensure_memory_status_transitions_schema(connection: sqlite3.Connection) -> None: + connection.execute( + """ + CREATE TABLE IF NOT EXISTS memory_status_transitions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + memory_type TEXT NOT NULL CHECK (memory_type IN ('fact', 'procedure', 
'episode')), + memory_id INTEGER NOT NULL, + from_status TEXT NOT NULL CHECK (from_status IN ('candidate', 'approved', 'disputed', 'deprecated')), + to_status TEXT NOT NULL CHECK (to_status IN ('candidate', 'approved', 'disputed', 'deprecated')), + reason TEXT, + actor TEXT, + evidence_ids_json TEXT NOT NULL DEFAULT '[]', + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + connection.execute( + "CREATE INDEX IF NOT EXISTS idx_memory_status_transitions_memory ON memory_status_transitions(memory_type, memory_id, id)" + ) + + def _ensure_retrieval_observations_schema(connection: sqlite3.Connection) -> None: connection.execute( """ @@ -448,6 +469,7 @@ def list_memory_status_history( memory_id: int, ) -> list[MemoryStatusTransition]: with connect(db_path) as connection: + _ensure_memory_status_transitions_schema(connection) rows = connection.execute( """ SELECT * @@ -747,6 +769,7 @@ def _update_status( evidence_ids: list[int], ) -> T: with connect(db_path) as connection: + _ensure_memory_status_transitions_schema(connection) current_row = connection.execute(f"SELECT * FROM {table_name} WHERE id = ?", (object_id,)).fetchone() if current_row is None: raise ValueError(f"No {memory_type} memory found with id {object_id}") @@ -807,21 +830,6 @@ def record_memory_retrieval( ) -_SECRET_ASSIGNMENT_PATTERN = re.compile(r"(?i)\b(password|passwd|pwd|token|api[_-]?key|secret|credential|connection[_-]?string)\s*[:=]\s*\S+") -_BEARER_PATTERN = re.compile(r"(?i)\bbearer\s+[A-Za-z0-9._~+\-/=]+") -_LONG_TOKEN_PATTERN = re.compile(r"\b[A-Za-z0-9_\-]{24,}\b") - - -def _redacted_query_preview(query: str, *, max_chars: int = 120) -> str: - preview = _SECRET_ASSIGNMENT_PATTERN.sub(lambda match: f"{match.group(1)}=[REDACTED]", query) - preview = _BEARER_PATTERN.sub("Bearer [REDACTED]", preview) - preview = _LONG_TOKEN_PATTERN.sub("[REDACTED]", preview) - preview = " ".join(preview.split()) - if len(preview) > max_chars: - return f"{preview[: max_chars - 1]}…" - return preview 
- - def _memory_ref(memory_type: str, memory_id: int) -> str: return f"{memory_type}:{memory_id}" @@ -862,7 +870,7 @@ def record_retrieval_observation( ( surface, query_sha256, - _redacted_query_preview(query), + None, preferred_scope, limit, json.dumps(list(statuses)), @@ -1291,7 +1299,7 @@ def retrieval_observation_from_row(row: sqlite3.Row) -> RetrievalObservation: created_at=row["created_at"], surface=row["surface"], query_sha256=row["query_sha256"], - query_preview=row["query_preview"], + query_preview=None, preferred_scope=row["preferred_scope"], limit=row["limit_value"], statuses=json.loads(row["statuses_json"]), diff --git a/tests/test_cli.py b/tests/test_cli.py index f18b30e..718538e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,5 +1,6 @@ import json import os +import sqlite3 import subprocess import sys from pathlib import Path @@ -160,6 +161,7 @@ def test_python_module_cli_retrieve_observe_records_secret_safe_local_observatio assert payload["observations"][0]["surface"] == "cli-test" assert payload["observations"][0]["query_sha256"] assert payload["observations"][0]["query_text"] is None + assert payload["observations"][0]["query_preview"] is None assert payload["observations"][0]["retrieved_memory_refs"] == [f"fact:{fact.id}"] assert payload["observations"][0]["top_memory_ref"] == f"fact:{fact.id}" assert "SUPERSECRET" not in list_result.stdout @@ -259,6 +261,123 @@ def test_python_module_cli_observations_audit_reports_frequent_and_stale_refs_wi assert "abc123" not in audit_result.stdout +def test_python_module_cli_observations_audit_reports_low_signal_empty_retrievals(tmp_path: Path) -> None: + db_path = tmp_path / "observation-audit-empty.db" + initialize_database(db_path) + + env = {**os.environ, "PYTHONPATH": "src"} + for query in ("no matching alpha", "no matching beta"): + retrieve_result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "retrieve", + str(db_path), + query, + "--observe", + "cli-test", + ], 
+ cwd=Path(__file__).resolve().parents[1], + env=env, + capture_output=True, + text=True, + ) + assert retrieve_result.returncode == 0, retrieve_result.stderr + + audit_result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "observations", + "audit", + str(db_path), + "--limit", + "20", + ], + cwd=Path(__file__).resolve().parents[1], + env=env, + capture_output=True, + text=True, + ) + + assert audit_result.returncode == 0, audit_result.stderr + payload = json.loads(audit_result.stdout) + assert payload["observation_count"] == 2 + assert payload["empty_retrieval_count"] == 2 + assert payload["empty_retrieval_ratio"] == 1.0 + assert "low_observation_count" in payload["quality_warnings"] + assert "high_empty_retrieval_ratio" in payload["quality_warnings"] + + + +def test_python_module_cli_approve_fact_migrates_existing_database_without_status_transition_table( + tmp_path: Path, +) -> None: + db_path = tmp_path / "legacy-status-transition.db" + initialize_database(db_path) + source = ingest_source_text( + db_path=db_path, + source_type="transcript", + content="Legacy status transition migration smoke.", + metadata={"project": "legacy-status-transition"}, + ) + fact = create_candidate_fact( + db_path=db_path, + subject_ref="Legacy transition", + predicate="marker", + object_ref_or_value="STATUS_TRANSITION_OK", + evidence_ids=[source.id], + scope="project:legacy-status-transition", + confidence=0.95, + ) + with sqlite3.connect(db_path) as connection: + connection.execute("DROP TABLE memory_status_transitions") + + env = {**os.environ, "PYTHONPATH": "src"} + approve_result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "approve-fact", + str(db_path), + str(fact.id), + ], + cwd=Path(__file__).resolve().parents[1], + env=env, + capture_output=True, + text=True, + ) + + assert approve_result.returncode == 0, approve_result.stderr + approve_payload = json.loads(approve_result.stdout) + assert approve_payload["status"] 
== "approved" + + history_result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "review", + "history", + "fact", + str(db_path), + str(fact.id), + ], + cwd=Path(__file__).resolve().parents[1], + env=env, + capture_output=True, + text=True, + ) + assert history_result.returncode == 0, history_result.stderr + history_payload = json.loads(history_result.stdout) + assert history_payload["history"][0]["from_status"] == "candidate" + assert history_payload["history"][0]["to_status"] == "approved" + + + def test_python_module_cli_observations_list_migrates_existing_database_without_observation_table(tmp_path: Path) -> None: db_path = tmp_path / "legacy-observation.db" initialize_database(db_path) @@ -1230,6 +1349,161 @@ def test_python_module_cli_hermes_pre_llm_hook_outputs_context_for_hermes_shell_ +def test_python_module_cli_hermes_pre_llm_hook_skips_synthetic_doctor_observation(tmp_path: Path) -> None: + db_path = tmp_path / "module-cli-hermes-synthetic-observation.db" + initialize_database(db_path) + source = ingest_source_text( + db_path=db_path, + source_type="transcript", + content="Synthetic hook doctor weather memory should not become dogfood observation data.", + metadata={"project": "synthetic-hook"}, + ) + fact = create_candidate_fact( + db_path=db_path, + subject_ref="Weather", + predicate="qa_marker", + object_ref_or_value="SYNTHETIC_SKIP", + evidence_ids=[source.id], + scope="project:synthetic-hook", + confidence=0.95, + ) + approve_fact(db_path=db_path, fact_id=fact.id) + + hook_payload = { + "hook_event_name": "pre_llm_call", + "session_id": "test-session", + "cwd": str(tmp_path), + "extra": { + "user_message": "What is the weather?", + "conversation_history": [], + "is_first_turn": True, + "model": "gpt-4", + "platform": "cli", + }, + } + env = {**os.environ, "PYTHONPATH": "src"} + result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "hermes-pre-llm-hook", + str(db_path), + "--preferred-scope", + 
"project:synthetic-hook", + "--top-k", + "1", + "--max-prompt-lines", + "8", + ], + cwd=Path(__file__).resolve().parents[1], + env=env, + input=json.dumps(hook_payload), + capture_output=True, + text=True, + ) + + assert result.returncode == 0, result.stderr + assert "SYNTHETIC_SKIP" in json.loads(result.stdout)["context"] + + observations_result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "observations", + "list", + str(db_path), + ], + cwd=Path(__file__).resolve().parents[1], + env=env, + capture_output=True, + text=True, + ) + assert observations_result.returncode == 0, observations_result.stderr + observations_payload = json.loads(observations_result.stdout) + assert observations_payload["observations"] == [] + + + +def test_python_module_cli_hermes_pre_llm_hook_injects_retrieved_memory_context(tmp_path: Path) -> None: + db_path = tmp_path / "module-cli-hermes-injection-proof.db" + initialize_database(db_path) + source = ingest_source_text( + db_path=db_path, + source_type="transcript", + content="The live Hermes QA marker is AM_LIVE_QA_137.", + metadata={"project": "hermes-injection-proof"}, + ) + fact = create_candidate_fact( + db_path=db_path, + subject_ref="Hermes live QA", + predicate="marker", + object_ref_or_value="AM_LIVE_QA_137", + evidence_ids=[source.id], + scope="project:hermes-injection-proof", + confidence=0.95, + ) + approve_fact(db_path=db_path, fact_id=fact.id) + + hook_payload = { + "hook_event_name": "pre_llm_call", + "session_id": "real-session-shape", + "cwd": str(tmp_path), + "extra": { + "user_message": "What is the live Hermes QA marker?", + "platform": "cli", + }, + } + env = {**os.environ, "PYTHONPATH": "src"} + result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "hermes-pre-llm-hook", + str(db_path), + "--preferred-scope", + "project:hermes-injection-proof", + "--top-k", + "1", + "--max-prompt-lines", + "8", + "--no-reason-codes", + ], + 
cwd=Path(__file__).resolve().parents[1],
+        env=env,
+        input=json.dumps(hook_payload),
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0, result.stderr
+    hook_response = json.loads(result.stdout)
+    assert "<agent-memory-context>" in hook_response["context"]
+    assert "Retrieved fact" in hook_response["context"]
+    assert "AM_LIVE_QA_137" in hook_response["context"]
+
+    observations_result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "agent_memory.api.cli",
+            "observations",
+            "list",
+            str(db_path),
+        ],
+        cwd=Path(__file__).resolve().parents[1],
+        env=env,
+        capture_output=True,
+        text=True,
+    )
+    assert observations_result.returncode == 0, observations_result.stderr
+    observations_payload = json.loads(observations_result.stdout)
+    assert observations_payload["observations"][0]["retrieved_memory_refs"] == [f"fact:{fact.id}"]
+
+
+
 def test_python_module_cli_hermes_pre_llm_hook_derives_path_scope_from_payload_cwd(
     tmp_path: Path,
 ) -> None: