diff --git a/.dev/status/current-handoff.md b/.dev/status/current-handoff.md index 3b1e72e..6a1c550 100644 --- a/.dev/status/current-handoff.md +++ b/.dev/status/current-handoff.md @@ -1,7 +1,7 @@ # agent-memory current handoff Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-05-01 01:55 KST +Last updated: 2026-05-01 10:21 KST ## Trigger for the next session @@ -16,7 +16,7 @@ read this file first. Do not ask the user to restate context. Verify repo state, ## Ready-to-say answer -agent-memory는 v0.1.37까지 배포/Hermes QA가 완료됐고, 현재는 실제 dogfood QA에서 발견된 observation 데이터 품질 이슈를 고치는 slice를 진행 중이야. 브랜치는 `fix/observation-dogfood-quality`, worktree는 `/Users/reddit/Project/agent-memory/.worktrees/observation-dogfood-quality`야. 목표는 query preview 제거, `hermes hooks doctor/test` synthetic pre-LLM payload가 dogfood observation을 오염시키지 않게 하기, audit에 데이터 부족/empty retrieval 품질 경고를 추가하기, 그리고 기존 DB에서 `memory_status_transitions` table이 없을 때 approve/review가 lazy migration 되도록 하는 거야. 실제 Hermes가 agent-memory에서 가져온 정보를 답변에 사용하는 E2E도 확인했어. +agent-memory는 v0.1.38까지 배포/Hermes QA가 완료됐고, 현재는 Priority 5 dogfood/noise monitoring의 다음 slice인 read-only observation review candidate report를 진행 중이야. 브랜치는 `feat/observation-review-candidates`, worktree는 `/Users/reddit/Project/agent-memory/.worktrees/observation-review-candidates`야. 목표는 `observations audit`의 top injected refs를 `review explain`, replacement/supersedes chain, `graph inspect` 요약과 copy-paste follow-up commands로 연결하는 거야. 자동 cleanup/mutation은 하지 않고 forensic review만 강화한다. ## Current repo state @@ -32,15 +32,17 @@ Expected GitHub identity: Verified base before this slice: -- latest completed release: `v0.1.37` -- v0.1.37 added read-only `agent-memory observations audit` and was published to GitHub/npm/PyPI. -- local Hermes hook uses `/Users/reddit/.agent-memory/runtime/v0.1.37/.venv/bin/agent-memory` against `/Users/reddit/.agent-memory/memory.db`. +- latest completed release: `v0.1.38` +- v0.1.38 removed observation query previews, skipped Hermes doctor/test synthetic observations, added audit quality warnings, and verified a real Hermes turn used an agent-memory fact. +- local Hermes hook uses `/Users/reddit/.agent-memory/runtime/v0.1.38/.venv/bin/agent-memory` against `/Users/reddit/.agent-memory/memory.db`. +- root checkout was clean on `main...origin/main` except local-only untracked state. +- open PRs were `[]`. Active slice/worktree: -- branch: `fix/observation-dogfood-quality` -- worktree: `/Users/reddit/Project/agent-memory/.worktrees/observation-dogfood-quality` -- intended release after merge: likely `v0.1.38` +- branch: `feat/observation-review-candidates` +- worktree: `/Users/reddit/Project/agent-memory/.worktrees/observation-review-candidates` +- intended release after merge: likely `v0.1.39` Expected local untracked artifacts to preserve in the root checkout: @@ -52,86 +54,62 @@ Expected local untracked artifacts to preserve in the root checkout: Do not delete or commit these unless the user explicitly asks. -## Current slice: observation dogfood data quality +## Current slice: observation review candidates Goal: -- Keep observation telemetry useful for real dogfood QA. -- Avoid storing prompt-like query previews. -- Avoid synthetic hook doctor/test payloads polluting observation audits. -- Make audit explicitly report low-signal data states. -- Ensure existing DBs lazily migrate missing lifecycle tables encountered during real local QA. +- Keep dogfood/noise monitoring read-only. +- Turn `observations audit` top refs into actionable forensic review candidates. +- Help operators see lifecycle status, replacement chains, and relation graph hints without exposing raw user queries or mutating memory. Implemented so far in the active worktree: -- `record_retrieval_observation` now writes `query_preview = None` for new observations. -- Hermes pre-LLM hook detects the deterministic `hermes hooks doctor/test` payload: - - session_id `test-session` - - user_message `What is the weather?` - - empty conversation_history - - is_first_turn true - - model `gpt-4` - - platform `cli` -- Synthetic doctor/test payloads still exercise hook context injection but do not write dogfood observation rows. -- `observations audit` now returns: - - `empty_retrieval_ratio` - - `quality_warnings` - - `no_observations` - - `low_observation_count` - - `high_empty_retrieval_ratio` -- `memory_status_transitions` now has lazy/idempotent schema ensure used by initialize, status update, and status history paths. +- New CLI: + - `agent-memory observations review-candidates --limit N --top N --frequent-threshold N` +- Output contract: + - `kind: retrieval_observation_review_candidates` + - `read_only: true` + - nested `observation_audit` payload + - `candidates[]` derived from `top_memory_refs` + - fact refs include `review_explain` payload equivalent to `review explain fact` + - graph summary includes depth-1 relation neighbor refs and edge count + - signals include existing audit signals plus `has_replacement` and `has_graph_relations` when applicable + - copy-paste commands for `review explain`, `review replacements`, and `graph inspect` +- Refactored CLI review explain to reuse `_fact_review_explanation_payload`. - Docs updated: - `README.md` - `docs/hermes-dogfood.md` -- Tests added/updated in `tests/test_cli.py`: - - query preview is absent from observation list output - - audit reports low-signal empty retrievals - - approve-fact migrates existing DBs missing `memory_status_transitions` - - Hermes hook synthetic doctor payload skips observation write - - Hermes hook context includes retrieved memory content when line budget allows +- Test added in `tests/test_cli.py`: + - `test_python_module_cli_observations_review_candidates_explains_top_refs_without_mutation_or_raw_queries` Verification so far: - RED confirmed: - - query_preview still present - - synthetic doctor payload wrote observation rows - - audit lacked `empty_retrieval_ratio`/`quality_warnings` - - existing DB without `memory_status_transitions` failed approve with sqlite OperationalError + - `observations review-candidates` was not a valid subcommand. - GREEN focused: - - `uv run pytest tests/test_cli.py::test_python_module_cli_approve_fact_migrates_existing_database_without_status_transition_table tests/test_cli.py::test_python_module_cli_retrieve_observe_records_secret_safe_local_observation tests/test_cli.py::test_python_module_cli_observations_audit_reports_low_signal_empty_retrievals tests/test_cli.py::test_python_module_cli_hermes_pre_llm_hook_skips_synthetic_doctor_observation tests/test_cli.py::test_python_module_cli_hermes_pre_llm_hook_injects_retrieved_memory_context -q` - - `5 passed` - -Live local Hermes QA already confirmed on v0.1.37 runtime before this patch: - -- Created a temporary approved fact in `/Users/reddit/.agent-memory/memory.db` with marker `AM_LIVE_E2E_1777567838` scoped to `/Users/reddit/Project/agent-memory`. -- Direct hook check confirmed: - - `direct_hook_contains_marker=True` - - `direct_hook_contains_agent_memory_context=True` - - `direct_hook_contains_retrieved_fact=True` -- Actual Hermes command confirmed the model used injected memory: - - `hermes --accept-hooks -z "What is the Hermes live E2E QA marker? Return only the marker and nothing else."` - - output contained `AM_LIVE_E2E_1777567838` -- Cleanup done: - - test fact id 2 deprecated with reason `live E2E QA cleanup` - - `review explain` showed `visible_in_default_retrieval: false` -- During live QA, an existing DB migration gap was discovered: - - approve failed until `agent-memory init ~/.agent-memory/memory.db` created `memory_status_transitions` - - this is now covered by the new lazy migration test/fix. + - `uv run pytest tests/test_cli.py::test_python_module_cli_observations_review_candidates_explains_top_refs_without_mutation_or_raw_queries -q` + - `1 passed` +- Broader focused: + - audit, review-candidates, review explain, graph inspect CLI tests + - `4 passed` +- Help smoke: + - `PYTHONPATH=src uv run python -m agent_memory.api.cli observations review-candidates --help` + - exit 0 Remaining before PR: -1. Run broader focused group and full local verification: +1. Run full local verification: - `uv run pytest tests/ -q` - `uv run python scripts/check_release_metadata.py` - `uv run python scripts/smoke_release_readiness.py` - `npm pack --dry-run` - `git diff --check` - `node --check bin/agent-memory.js` -2. Run real smoke for observation list/audit on a temp DB and confirm query_preview is null and no raw secret-like text appears. +2. Run real temp-DB smoke for `observations review-candidates` and confirm no raw secret-like query text appears. 3. Run static diff secret scan. 4. Create PR, watch CI, merge, follow release-sync/publish/published smoke/Hermes QA. -5. After v0.1.38 install, repeat Hermes hook doctor and one real E2E check with the new runtime. +5. After v0.1.39 install, repeat Hermes hook doctor and run installed `observations review-candidates` against the existing local DB. ## Next natural slice after this one -After this data-quality fix is released and Hermes QA passes, continue dogfood/noise monitoring using the cleaner audit data. Avoid mutating cleanup or broader graph retrieval until there are enough real, non-synthetic observations to justify ranking/scope changes. +After the read-only review candidate report is released and dogfooded, continue gathering real observation data. If enough non-synthetic observations accumulate, the next likely work is retrieval quality diagnostics for high empty-retrieval ratios or scope/ranking misses. Avoid automatic cleanup/deprecation until the review candidate workflow has been used on real local data. diff --git a/README.md b/README.md index 212bb5c..7fc7659 100644 --- a/README.md +++ b/README.md @@ -109,9 +109,10 @@ For local dogfood and noise monitoring, retrievals can leave a secret-safe obser agent-memory retrieve "$DB" "How should I install agent-memory?" --preferred-scope user:default --observe cli agent-memory observations list "$DB" --limit 20 agent-memory observations audit "$DB" --limit 200 --top 10 --frequent-threshold 3 +agent-memory observations review-candidates "$DB" --limit 200 --top 10 --frequent-threshold 3 ``` -Use the observation log and audit report to spot frequently injected or surprising memories before changing retrieval behavior. The audit output is read-only JSON with surface/scope counts, empty-retrieval count and ratio, quality warnings such as `low_observation_count` or `high_empty_retrieval_ratio`, top injected memory refs, current status for known refs, and simple signals such as `frequently_injected` and `current_status_not_approved`. Treat it as local operator telemetry, not a synced analytics stream. +Use the observation log and audit report to spot frequently injected or surprising memories before changing retrieval behavior. The audit output is read-only JSON with surface/scope counts, empty-retrieval count and ratio, quality warnings such as `low_observation_count` or `high_empty_retrieval_ratio`, top injected memory refs, current status for known refs, and simple signals such as `frequently_injected` and `current_status_not_approved`. `observations review-candidates` is also read-only; it turns the top audit refs into forensic candidates with fact review explanations, replacement-chain hints, graph-neighborhood summaries, and copy-paste follow-up commands such as `review explain`, `review replacements`, and `graph inspect`. Treat these reports as local operator telemetry, not a synced analytics feature or an automatic cleanup workflow. ## Hermes quickstart diff --git a/docs/hermes-dogfood.md b/docs/hermes-dogfood.md index c1ed53c..0ef9525 100644 --- a/docs/hermes-dogfood.md +++ b/docs/hermes-dogfood.md @@ -48,10 +48,20 @@ Hermes pre-LLM hook retrievals write a secret-safe local observation row to the ```bash agent-memory observations list ~/.agent-memory/memory.db --limit 20 agent-memory observations audit ~/.agent-memory/memory.db --limit 200 --top 10 --frequent-threshold 3 +agent-memory observations review-candidates ~/.agent-memory/memory.db --limit 200 --top 10 --frequent-threshold 3 ``` Use this before tuning ranking or adding broader graph traversal: first confirm which memories are frequently injected, which scopes are active, whether retrieval is often empty, and whether any frequently injected refs are now deprecated/disputed/missing. The audit command is read-only and summarizes local observation rows without emitting raw query text or query previews. Keep this data local unless you intentionally export it. +`observations review-candidates` is the next read-only step after audit. It keeps the same secret-safe observation summary, then expands each top ref into a forensic candidate: + +- fact refs include the same lifecycle explanation as `agent-memory review explain fact ...`. +- replacement/supersedes chains are surfaced as candidate signals instead of mutating anything. +- relation graph neighbors are summarized so you know when `agent-memory graph inspect ...` is worth running. +- the JSON includes copy-paste follow-up commands for `review explain`, `review replacements`, and `graph inspect`. + +Do not treat review candidates as automatic cleanup recommendations. They are a short list for human review; approve/deprecate/supersede decisions should still be explicit curation actions. + When the audit reports `quality_warnings`, treat them as QA signals rather than cleanup instructions: - `no_observations`: Hermes has not produced dogfood observation data yet; check hook install/allowlist and run a real prompt. diff --git a/src/agent_memory/api/cli.py b/src/agent_memory/api/cli.py index 89701c4..f517bc6 100644 --- a/src/agent_memory/api/cli.py +++ b/src/agent_memory/api/cli.py @@ -125,16 +125,57 @@ def _status_counts_for_facts(facts) -> dict[str, int]: return counts -def _current_status_for_memory_ref(db_path: Path, memory_ref: str) -> str | None: +def _memory_ref_parts(memory_ref: str) -> tuple[str, int] | None: memory_type, separator, raw_id = memory_ref.partition(":") if separator != ":" or not raw_id.isdigit() or memory_type not in {"fact", "procedure", "episode"}: return None + return memory_type, int(raw_id) + + +def _current_status_for_memory_ref(db_path: Path, memory_ref: str) -> str | None: + parts = _memory_ref_parts(memory_ref) + if parts is None: + return None + memory_type, memory_id = parts try: - return get_memory_status(db_path, memory_type=memory_type, memory_id=int(raw_id)) + return get_memory_status(db_path, memory_type=memory_type, memory_id=memory_id) except ValueError: return "missing" +def _fact_review_explanation_payload(db_path: Path, *, fact_id: int) -> dict[str, Any]: + fact = get_fact(db_path, fact_id=fact_id) + claim_facts = list_facts_by_claim_slot( + db_path, + subject_ref=fact.subject_ref, + predicate=fact.predicate, + scope=fact.scope, + ) + history = list_memory_status_history(db_path, memory_type="fact", memory_id=fact.id) + replacement_relations = list_fact_replacement_relations(db_path, fact_id=fact.id) + replacement_chain = _fact_replacement_chain_payload(replacement_relations, fact_id=fact.id) + return { + "memory_type": "fact", + "memory_id": fact.id, + "fact": fact.model_dump(mode="json"), + "decision": { + "current_status": fact.status, + "visible_in_default_retrieval": fact.status == "approved", + "summary": _fact_decision_summary(status=fact.status, replacement_chain=replacement_chain), + }, + "claim_slot": { + "subject_ref": fact.subject_ref, + "predicate": fact.predicate, + "scope": fact.scope, + "counts": _status_counts_for_facts(claim_facts), + "facts": [item.model_dump(mode="json") for item in claim_facts], + }, + "history": [entry.model_dump(mode="json") for entry in history], + "replacement_chain": replacement_chain, + "default_retrieval_policy": "approved_only", + } + + def _audit_retrieval_observations( db_path: Path, *, @@ -209,6 +250,74 @@ def _audit_retrieval_observations( } +def _review_candidates_from_observations( + db_path: Path, + *, + limit: int, + top: int, + frequent_threshold: int, +) -> dict[str, Any]: + audit = _audit_retrieval_observations( + db_path, + limit=limit, + top=top, + frequent_threshold=frequent_threshold, + ) + candidates = [] + for top_ref in audit["top_memory_refs"]: + memory_ref = top_ref["memory_ref"] + parts = _memory_ref_parts(memory_ref) + review_explain = None + replacement_chain = None + if parts is not None and parts[0] == "fact" and top_ref["current_status"] != "missing": + review_explain = _fact_review_explanation_payload(db_path, fact_id=parts[1]) + replacement_chain = review_explain["replacement_chain"] + + graph = _inspect_relation_graph(db_path, start_ref=memory_ref, depth=1, limit=25) + signals = list(top_ref["signals"]) + if replacement_chain is not None and ( + replacement_chain["superseded_by"] or replacement_chain["replaces"] + ): + signals.append("has_replacement") + if graph["edges"]: + signals.append("has_graph_relations") + + commands = {"graph_inspect": f"agent-memory graph inspect {db_path} {memory_ref} --depth 1"} + if parts is not None: + memory_type, memory_id = parts + if memory_type == "fact": + commands["review_explain"] = f"agent-memory review explain fact {db_path} {memory_id}" + commands["review_replacements"] = f"agent-memory review replacements fact {db_path} {memory_id}" + + ordered_commands = {} + for command_name in ("review_explain", "review_replacements", "graph_inspect"): + if command_name in commands: + ordered_commands[command_name] = commands[command_name] + + candidates.append( + { + **top_ref, + "signals": signals, + "review_explain": review_explain, + "graph_summary": { + "start_ref": graph["start_ref"], + "depth": graph["depth"], + "edge_count": len(graph["edges"]), + "neighbor_refs": [edge["neighbor_ref"] for edge in graph["edges"]], + "truncated": graph["truncated"], + }, + "commands": ordered_commands, + } + ) + + return { + "kind": "retrieval_observation_review_candidates", + "read_only": True, + "observation_audit": audit, + "candidates": candidates, + } + + def _inspect_relation_graph(db_path: Path, *, start_ref: str, depth: int, limit: int) -> dict[str, Any]: if depth < 0: raise ValueError("graph inspect depth must be >= 0") @@ -519,6 +628,14 @@ def _build_parser() -> argparse.ArgumentParser: observations_audit_parser.add_argument("--limit", type=int, default=200) observations_audit_parser.add_argument("--top", type=int, default=10) observations_audit_parser.add_argument("--frequent-threshold", type=int, default=3) + observations_review_candidates_parser = observations_subparsers.add_parser( + "review-candidates", + help="Build a read-only forensic review report from top retrieval observation refs.", + ) + observations_review_candidates_parser.add_argument("db_path", type=Path) + observations_review_candidates_parser.add_argument("--limit", type=int, default=200) + observations_review_candidates_parser.add_argument("--top", type=int, default=10) + observations_review_candidates_parser.add_argument("--frequent-threshold", type=int, default=3) graph_parser = subparsers.add_parser("graph") graph_subparsers = graph_parser.add_subparsers(dest="graph_action", required=True) @@ -841,41 +958,7 @@ def main() -> None: ) return elif args.review_action == "explain": - fact = get_fact(args.db_path, fact_id=args.memory_id) - claim_facts = list_facts_by_claim_slot( - args.db_path, - subject_ref=fact.subject_ref, - predicate=fact.predicate, - scope=fact.scope, - ) - history = list_memory_status_history(args.db_path, memory_type="fact", memory_id=fact.id) - replacement_relations = list_fact_replacement_relations(args.db_path, fact_id=fact.id) - replacement_chain = _fact_replacement_chain_payload(replacement_relations, fact_id=fact.id) - print( - json.dumps( - { - "memory_type": "fact", - "memory_id": fact.id, - "fact": fact.model_dump(mode="json"), - "decision": { - "current_status": fact.status, - "visible_in_default_retrieval": fact.status == "approved", - "summary": _fact_decision_summary(status=fact.status, replacement_chain=replacement_chain), - }, - "claim_slot": { - "subject_ref": fact.subject_ref, - "predicate": fact.predicate, - "scope": fact.scope, - "counts": _status_counts_for_facts(claim_facts), - "facts": [item.model_dump(mode="json") for item in claim_facts], - }, - "history": [entry.model_dump(mode="json") for entry in history], - "replacement_chain": replacement_chain, - "default_retrieval_policy": "approved_only", - }, - indent=2, - ) - ) + print(json.dumps(_fact_review_explanation_payload(args.db_path, fact_id=args.memory_id), indent=2)) return elif args.review_action == "conflicts": facts = list_facts_by_claim_slot( @@ -950,6 +1033,19 @@ def main() -> None: ) ) return + if args.observations_action == "review-candidates": + print( + json.dumps( + _review_candidates_from_observations( + args.db_path, + limit=args.limit, + top=args.top, + frequent_threshold=args.frequent_threshold, + ), + indent=2, + ) + ) + return raise ValueError(f"Unsupported observations action: {args.observations_action}") if args.command == "graph": diff --git a/tests/test_cli.py b/tests/test_cli.py index 718538e..f6f2d39 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,7 @@ from pathlib import Path from agent_memory.api.cli import main -from agent_memory.core.curation import approve_fact, create_candidate_fact +from agent_memory.core.curation import approve_fact, create_candidate_fact, supersede_fact from agent_memory.core.ingestion import ingest_source_text from agent_memory.integrations.hermes_hooks import scope_from_cwd from agent_memory.storage.sqlite import initialize_database, insert_relation, update_memory_status @@ -261,6 +261,120 @@ def test_python_module_cli_observations_audit_reports_frequent_and_stale_refs_wi assert "abc123" not in audit_result.stdout +def test_python_module_cli_observations_review_candidates_explains_top_refs_without_mutation_or_raw_queries( + tmp_path: Path, +) -> None: + db_path = tmp_path / "observation-review-candidates.db" + initialize_database(db_path) + source = ingest_source_text( + db_path=db_path, + source_type="transcript", + content="Review candidate target phrase moved from OLD_VALUE to NEW_VALUE.", + metadata={"project": "observation-review"}, + ) + old_fact = create_candidate_fact( + db_path=db_path, + subject_ref="Review candidate", + predicate="target_phrase", + object_ref_or_value="OLD_VALUE", + evidence_ids=[source.id], + scope="project:observation-review", + confidence=0.7, + ) + replacement_fact = create_candidate_fact( + db_path=db_path, + subject_ref="Review candidate", + predicate="target_phrase", + object_ref_or_value="NEW_VALUE", + evidence_ids=[source.id], + scope="project:observation-review", + confidence=0.95, + ) + approve_fact(db_path=db_path, fact_id=old_fact.id) + + env = {**os.environ, "PYTHONPATH": "src"} + for secret_query in ( + "What is the review candidate target phrase? password=SUPERSECRET", + "Repeat the review candidate target phrase token=abc123", + ): + retrieve_result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "retrieve", + str(db_path), + secret_query, + "--preferred-scope", + "project:observation-review", + "--observe", + "cli-test", + ], + cwd=Path(__file__).resolve().parents[1], + env=env, + capture_output=True, + text=True, + ) + assert retrieve_result.returncode == 0, retrieve_result.stderr + + supersede_fact( + db_path=db_path, + superseded_fact_id=old_fact.id, + replacement_fact_id=replacement_fact.id, + reason="new target phrase replaced old one", + actor="test", + evidence_ids=[source.id], + ) + + review_result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "observations", + "review-candidates", + str(db_path), + "--limit", + "50", + "--top", + "5", + "--frequent-threshold", + "2", + ], + cwd=Path(__file__).resolve().parents[1], + env=env, + capture_output=True, + text=True, + ) + + assert review_result.returncode == 0, review_result.stderr + payload = json.loads(review_result.stdout) + assert payload["kind"] == "retrieval_observation_review_candidates" + assert payload["read_only"] is True + assert payload["observation_audit"]["kind"] == "retrieval_observation_audit" + assert payload["observation_audit"]["read_only"] is True + candidate = payload["candidates"][0] + assert candidate["memory_ref"] == f"fact:{old_fact.id}" + assert candidate["injection_count"] == 2 + assert candidate["current_status"] == "deprecated" + assert candidate["signals"] == [ + "frequently_injected", + "current_status_not_approved", + "has_replacement", + "has_graph_relations", + ] + assert candidate["review_explain"]["decision"]["visible_in_default_retrieval"] is False + assert candidate["review_explain"]["replacement_chain"]["superseded_by"][0]["replacement_fact_id"] == replacement_fact.id + assert candidate["graph_summary"]["edge_count"] == 1 + assert candidate["commands"] == { + "review_explain": f"agent-memory review explain fact {db_path} {old_fact.id}", + "review_replacements": f"agent-memory review replacements fact {db_path} {old_fact.id}", + "graph_inspect": f"agent-memory graph inspect {db_path} fact:{old_fact.id} --depth 1", + } + assert "SUPERSECRET" not in review_result.stdout + assert "abc123" not in review_result.stdout + + def test_python_module_cli_observations_audit_reports_low_signal_empty_retrievals(tmp_path: Path) -> None: db_path = tmp_path / "observation-audit-empty.db" initialize_database(db_path)