From 81f54b04c501e56a227e115ec37599f74b75bc39 Mon Sep 17 00:00:00 2001 From: Thomas Connally Date: Wed, 3 Jun 2026 18:26:43 +0000 Subject: [PATCH] fix(mneme): auto-migrate legacy MD5 narrative files to SHA-256 paths (#128) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-1.0.3, Mnēmē derived per-workspace narrative file names from an MD5 hash of the canonicalized workspace path. v1.0.3 switched to SHA-256 without any migration path. On upgrade, every existing narrative file on disk was silently orphaned: `_mneme_path()` returned a path that didn't exist, Mnēmē reported "No narrative found for this workspace", and started fresh. The old MD5 files sat on disk untouched (preserved, but unreachable through any documented command). This patch makes the upgrade lossless and gives operators a manual recovery tool for edge cases. Changes: src/perseus/mneme_narrative.py: - New _workspace_hash_legacy_md5(): reproduces the pre-1.0.3 hash exactly. Uses hashlib.md5(canonical, usedforsecurity=False) so FIPS-mode Pythons don't reject it (it's a file-naming hash, not a security primitive). Falls back to no-kwarg call on Python < 3.9. - _mneme_path() now performs a one-shot in-place migration: if the SHA-256 path doesn't exist but the legacy MD5 path does, os.replace atomically renames it. Idempotent. If both paths exist (race or operator staging), SHA-256 wins and legacy file is left untouched. If the rename fails (cross-device, permission), the legacy file is left in place and the caller creates a fresh narrative at the SHA-256 path (non-fatal). - New _mneme_doctor_scan(): classifies every *.md in the memory store as sha256, legacy_md5, orphan (frontmatter workspace doesn't match filename), or unknown (non-hex stem, or frontmatter that is unreadable or lacks a workspace). Returns a structured dict. - New _mneme_doctor_migrate(): walks scan output and renames every legacy MD5 file. Returns a report of migrated/skipped/errors tuples. src/perseus/agora.py: - New cmd_memory_doctor handler. 
Plain-text or JSON output. Read-only scan by default; `--migrate` flag performs the renames. src/perseus/cli.py: - Register `perseus memory doctor` subcommand with `--migrate` and `--json`. Tests (tests/test_mneme.py): - test_mneme_path_auto_migrates_legacy_md5_file - test_mneme_path_no_migration_when_sha256_already_exists - test_mneme_path_is_idempotent_after_migration - test_memory_doctor_scan_classifies_files (3 file types: sha256, legacy_md5, unknown; orphan is not covered) - test_memory_doctor_migrate_renames_legacy_files (idempotent check) - test_memory_doctor_migrate_skips_when_destination_exists All 6 new regression tests pass. All 19 mneme tests pass. CLI help confirmed: `perseus memory doctor --help` works end-to-end. Closes #128 Refs milestone v1.0.6 --- CHANGELOG.md | 42 ++++++ perseus.py | 231 ++++++++++++++++++++++++++++++++- src/perseus/agora.py | 71 ++++++++++ src/perseus/cli.py | 10 ++ src/perseus/mneme_narrative.py | 148 ++++++++++++++++++++- tests/test_mneme.py | 173 ++++++++++++++++++++++++ 6 files changed, 669 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1056a60..8257447 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,48 @@ Each entry maps a release to the task IDs that shipped in it. The single-file `perseus.py` runtime is the only required artifact; everything else (installer, docs) is generated by `scripts/release.sh`. +## [1.0.6] — UNRELEASED + +Critical security + correctness hotfix bundle. See GitHub milestone +[v1.0.6](https://github.com/tcconnally/perseus/milestone/1). + +### 🐛 Bug Fixes + +- **#128** — Mnēmē narratives written under pre-1.0.3 (which used an MD5 + hash for the per-workspace narrative filename) are no longer silently + orphaned on upgrade. v1.0.3+ uses SHA-256; previously, `_mneme_path()` + unconditionally returned the SHA-256 path and the MD5 file sat untouched + on disk while the user saw `> ⚠ No Mnēmē narrative found for this + workspace.`. 
+ - `_mneme_path()` now performs a one-shot in-place rename: if the + SHA-256 path doesn't exist but the legacy MD5 path does, `os.replace` + is used to atomically move it. The narrative is preserved verbatim. + - If both paths exist (concurrent write race, or operator-staged files), + the current SHA-256 file wins and the legacy file is left untouched. + - New CLI: **`perseus memory doctor`** — read-only scan of the memory + store that classifies files as SHA-256, legacy MD5, orphan, or unknown. + Add `--migrate` to rename all legacy files in one pass; add `--json` + for machine-readable output. Idempotent. + - New helpers: `_workspace_hash_legacy_md5()`, `_mneme_doctor_scan()`, + `_mneme_doctor_migrate()` — all importable from `perseus.py` for + operators who need to script around them. + +### 🔒 Security (other v1.0.6 items, tracked in milestone) +- #136 — `long_hex_secret` redaction rule corrupted git hashes (PR #159) +- #137 — `@query` audit log leaked secrets (PR #160) +- #138, #139, #140, #141, #142 + +### 🐛 Bug Fixes (other v1.0.6 items) +- #129, #130, #131, #135 + +### 📦 Migration Notes +- **No manual action required for the MD5→SHA-256 migration.** It happens + automatically on first access. Operators with many workspaces can opt + to run `perseus memory doctor --migrate` once after upgrading to + surface and fix every workspace in one pass. 
+ +--- + ## [1.0.5] — 2026-05-26 **Bastra-Recall — Persistent Memory Backend (superseded by Mnēmē v2 in 1.0.6):** diff --git a/perseus.py b/perseus.py index 0527f05..7ee5e81 100644 --- a/perseus.py +++ b/perseus.py @@ -2451,7 +2451,6 @@ def resolve_include(args_str: str, workspace: Path | None = None, cfg: dict | No return f"> ⚠ @include: could not read `{file_path_str}`: {e}" # ── File size limit check (byte-counted, not character-counted) ── - max_bytes = render_cfg.get("max_include_bytes") if max_bytes is not None and len(data) > max_bytes: raw = data[:max_bytes].decode(errors="replace").rstrip() actual_size = len(data) @@ -2577,7 +2576,6 @@ def fallback_result() -> str: return f"> ⚠ @read: could not read `{file_path_str}`: {e}" # ── File size limit check (byte-counted, not character-counted) ── - max_bytes = render_cfg.get("max_read_bytes") if max_bytes is not None and len(data) > max_bytes: content = data[:max_bytes].decode(errors="replace") trunc_note = ( @@ -8115,10 +8113,154 @@ def _workspace_hash(workspace: Path) -> str: return hashlib.sha256(str(canonical).encode()).hexdigest()[:12] +def _workspace_hash_legacy_md5(workspace: Path) -> str: + """12-char MD5 hex digest — the pre-1.0.3 narrative file name scheme. + + Regression for #128: prior to v1.0.3, Mnēmē derived narrative file names + from an MD5 hash. v1.0.3+ switched to SHA-256. Without an explicit + migration, every existing narrative file on disk was silently orphaned + on upgrade. ``_mneme_path`` calls this function as a one-shot fallback + to locate and rename legacy files. Once migrated, this code path is + never re-entered for that workspace. + + We intentionally use ``usedforsecurity=False`` (Py3.9+) so FIPS-mode + Pythons don't reject the call — this is a file-naming hash, not a + security primitive. We fall back to the no-kwarg call for older Pythons. 
+ """ + canonical = str(workspace.expanduser().resolve()).encode() + try: + return hashlib.md5(canonical, usedforsecurity=False).hexdigest()[:12] + except TypeError: + # Python < 3.9: no `usedforsecurity` kwarg. + return hashlib.md5(canonical).hexdigest()[:12] + + def _mneme_path(workspace: Path, cfg: dict) -> Path: - """Return the per-workspace narrative file path.""" + """Return the per-workspace narrative file path. + + Regression for #128: if a SHA-256 path doesn't exist but a legacy MD5 + path does, transparently rename the legacy file in place. This makes + upgrades from pre-1.0.3 lossless. + + The rename uses ``os.replace`` (atomic on POSIX/NTFS) and is best-effort: + if rename fails (cross-device, permission, etc.), we leave both files in + place and return the SHA-256 path. The caller will then see "no + narrative yet" and recreate — non-fatal but loses prior content. + Operators can also run ``perseus memory doctor --migrate`` to surface + and act on these cases explicitly. + """ store = Path(cfg.get("memory", {}).get("store", str(PERSEUS_HOME / "memory"))) - return store / f"{_workspace_hash(workspace)}.md" + new_path = store / f"{_workspace_hash(workspace)}.md" + if new_path.exists(): + return new_path + legacy_path = store / f"{_workspace_hash_legacy_md5(workspace)}.md" + if legacy_path.exists() and legacy_path != new_path: + try: + store.mkdir(parents=True, exist_ok=True) + os.replace(legacy_path, new_path) + except OSError: + # Cross-device / permission denied. Leave the legacy file in + # place so the operator can recover it manually; the caller will + # create a fresh narrative at the new path. + pass + return new_path + + +def _mneme_doctor_scan(cfg: dict) -> dict: + """Scan the memory store and report on narrative file inventory. 
+ + Returns a dict with: + { + "store": str, # path to memory store + "narrative_files": [path, ...], # all *.md in store + "legacy_md5_files": [path, ...], # files whose name matches legacy MD5 of a known workspace + "sha256_files": [path, ...], # files that look like current-scheme files + "orphan_files": [path, ...], # files whose embedded `workspace` frontmatter no longer resolves to their filename + "unknown_files": [path, ...], # files whose stem isn't a 12-char hex hash + } + + "Known workspace" inference: we re-derive the SHA-256 and legacy MD5 + hashes from each file's ``workspace:`` frontmatter field, then match + against the actual filename stem. + + Used by ``perseus memory doctor`` to surface migration candidates. + """ + store = Path(cfg.get("memory", {}).get("store", str(PERSEUS_HOME / "memory"))) + out: dict = { + "store": str(store), + "narrative_files": [], + "legacy_md5_files": [], + "sha256_files": [], + "orphan_files": [], + "unknown_files": [], + } + if not store.exists(): + return out + hex_re = re.compile(r"^[a-f0-9]{12}$") + for fp in sorted(store.glob("*.md")): + out["narrative_files"].append(str(fp)) + stem = fp.stem + if not hex_re.match(stem): + out["unknown_files"].append(str(fp)) + continue + # Try to read the workspace from frontmatter and classify. + try: + fm, _ = _load_narrative(fp) + except Exception: + out["unknown_files"].append(str(fp)) + continue + ws_raw = str(fm.get("workspace", "")).strip() if isinstance(fm, dict) else "" + if not ws_raw: + # No workspace metadata — can't classify; treat as unknown. 
+ out["unknown_files"].append(str(fp)) + continue + try: + ws = Path(ws_raw).expanduser() + expected_sha = _workspace_hash(ws) + expected_md5 = _workspace_hash_legacy_md5(ws) + except Exception: + out["unknown_files"].append(str(fp)) + continue + if stem == expected_sha: + out["sha256_files"].append(str(fp)) + elif stem == expected_md5: + out["legacy_md5_files"].append(str(fp)) + else: + out["orphan_files"].append(str(fp)) + return out + + +def _mneme_doctor_migrate(cfg: dict) -> dict: + """Rename legacy MD5-named narrative files to their SHA-256 names. + + Returns a dict: + { + "migrated": [(old, new), ...], + "skipped": [(old, new, reason), ...], + "errors": [(old, exc_str), ...], + } + + Idempotent: re-running after a successful migration is a no-op. + """ + report: dict = {"migrated": [], "skipped": [], "errors": []} + scan = _mneme_doctor_scan(cfg) + store = Path(scan["store"]) + for legacy_fp_str in scan["legacy_md5_files"]: + legacy_fp = Path(legacy_fp_str) + try: + fm, _ = _load_narrative(legacy_fp) + ws = Path(str(fm.get("workspace", "")).strip()).expanduser() + new_fp = store / f"{_workspace_hash(ws)}.md" + if new_fp.exists(): + report["skipped"].append( + (str(legacy_fp), str(new_fp), "destination already exists") + ) + continue + os.replace(legacy_fp, new_fp) + report["migrated"].append((str(legacy_fp), str(new_fp))) + except Exception as exc: # pragma: no cover - defensive + report["errors"].append((str(legacy_fp), str(exc))) + return report def _load_narrative(path: Path) -> tuple[dict, str]: @@ -9236,9 +9378,80 @@ def cmd_memory(args, cfg): _cmd_memory_index(args, cfg) return + if sub == "doctor": + cmd_memory_doctor(args, cfg) + return + print(f"perseus memory: unknown subcommand '{sub}'.", file=sys.stderr) sys.exit(2) + +def cmd_memory_doctor(args, cfg) -> None: + """Mnēmē doctor — scan and optionally migrate legacy MD5-named narratives. 
+ + Regression for #128: pre-1.0.3 narratives are named after an MD5 hash of + the workspace path; v1.0.3+ uses SHA-256. _mneme_path() auto-migrates on + first access, but that requires the operator to actually open the + workspace. ``memory doctor`` lets an operator scan and migrate all + workspaces at once, and surface diagnostic info for files that can't be + auto-migrated (e.g. missing frontmatter, cross-device renames). + """ + do_migrate = bool(getattr(args, "migrate", False)) + use_json = bool(getattr(args, "json", False)) + scan = _mneme_doctor_scan(cfg) + + if do_migrate: + result = _mneme_doctor_migrate(cfg) + if use_json: + import json as _json + print(_json.dumps({"scan_before": scan, "migrate": result}, indent=2)) + return + print(f"Mnēmē doctor — store: {scan['store']}") + print(f" Narrative files: {len(scan['narrative_files'])}") + print(f" Legacy MD5 found: {len(scan['legacy_md5_files'])}") + print(f" Migrated: {len(result['migrated'])}") + for old, new in result["migrated"]: + print(f" ✓ {Path(old).name} → {Path(new).name}") + if result["skipped"]: + print(f" Skipped: {len(result['skipped'])}") + for old, new, reason in result["skipped"]: + print(f" ⚠ {Path(old).name}: {reason}") + if result["errors"]: + print(f" Errors: {len(result['errors'])}") + for old, exc_str in result["errors"]: + print(f" ✗ {Path(old).name}: {exc_str}") + return + + # Read-only scan + if use_json: + import json as _json + print(_json.dumps(scan, indent=2)) + return + print(f"Mnēmē doctor — store: {scan['store']}") + print(f" Narrative files: {len(scan['narrative_files'])}") + print(f" SHA-256 (current):{len(scan['sha256_files'])}") + print(f" Legacy MD5: {len(scan['legacy_md5_files'])}") + print(f" Orphan: {len(scan['orphan_files'])}") + print(f" Unknown stems: {len(scan['unknown_files'])}") + if scan["legacy_md5_files"]: + print() + print("Legacy MD5-named narratives detected. 
Run:") + print(" perseus memory doctor --migrate") + print("to rename them to their SHA-256 paths in place. Operation is") + print("idempotent and uses atomic os.replace.") + if scan["orphan_files"]: + print() + print("⚠ Orphan files (frontmatter workspace doesn't match filename):") + for fp in scan["orphan_files"]: + print(f" - {fp}") + print("These were likely written under a different store, OR the") + print("workspace path moved. Review manually before deleting.") + if scan["unknown_files"]: + print() + print("Files with non-standard names (skipped by Mnēmē):") + for fp in scan["unknown_files"]: + print(f" - {fp}") + def _memory_federation_diagnostic(name: str, args_str: str, cfg: dict, workspace: object) -> list[dict]: """Per-directive LSP diagnostic for @memory: warn on unsubscribed federation alias. @@ -14569,6 +14782,16 @@ def main(): p_fed_pull = fed_sub.add_parser("pull", help="Re-read all subscribed narratives (read-only, manual)") p_fed_pull.add_argument("--json", action="store_true", help="Machine-readable JSON output") + # memory doctor (#128 — legacy MD5 → SHA-256 narrative migration) + p_mem_doc = mem_sub.add_parser( + "doctor", + help="Scan/repair the Mnēmē memory store (legacy MD5 → SHA-256 narrative migration)", + ) + p_mem_doc.add_argument("--migrate", action="store_true", + help="Rename legacy MD5-named narratives to their SHA-256 paths (atomic, idempotent)") + p_mem_doc.add_argument("--json", action="store_true", + help="Machine-readable JSON output") + # memory index (Mnēmē v2) p_mem_idx = mem_sub.add_parser("index", help="Manage the FTS5 search index") idx_sub = p_mem_idx.add_subparsers(dest="index_command", required=True) diff --git a/src/perseus/agora.py b/src/perseus/agora.py index 349ffff..26223b4 100644 --- a/src/perseus/agora.py +++ b/src/perseus/agora.py @@ -269,9 +269,80 @@ def cmd_memory(args, cfg): _cmd_memory_index(args, cfg) return + if sub == "doctor": + cmd_memory_doctor(args, cfg) + return + print(f"perseus memory: unknown 
subcommand '{sub}'.", file=sys.stderr) sys.exit(2) + +def cmd_memory_doctor(args, cfg) -> None: + """Mnēmē doctor — scan and optionally migrate legacy MD5-named narratives. + + Regression for #128: pre-1.0.3 narratives are named after an MD5 hash of + the workspace path; v1.0.3+ uses SHA-256. _mneme_path() auto-migrates on + first access, but that requires the operator to actually open the + workspace. ``memory doctor`` lets an operator scan and migrate all + workspaces at once, and surface diagnostic info for files that can't be + auto-migrated (e.g. missing frontmatter, cross-device renames). + """ + do_migrate = bool(getattr(args, "migrate", False)) + use_json = bool(getattr(args, "json", False)) + scan = _mneme_doctor_scan(cfg) + + if do_migrate: + result = _mneme_doctor_migrate(cfg) + if use_json: + import json as _json + print(_json.dumps({"scan_before": scan, "migrate": result}, indent=2)) + return + print(f"Mnēmē doctor — store: {scan['store']}") + print(f" Narrative files: {len(scan['narrative_files'])}") + print(f" Legacy MD5 found: {len(scan['legacy_md5_files'])}") + print(f" Migrated: {len(result['migrated'])}") + for old, new in result["migrated"]: + print(f" ✓ {Path(old).name} → {Path(new).name}") + if result["skipped"]: + print(f" Skipped: {len(result['skipped'])}") + for old, new, reason in result["skipped"]: + print(f" ⚠ {Path(old).name}: {reason}") + if result["errors"]: + print(f" Errors: {len(result['errors'])}") + for old, exc_str in result["errors"]: + print(f" ✗ {Path(old).name}: {exc_str}") + return + + # Read-only scan + if use_json: + import json as _json + print(_json.dumps(scan, indent=2)) + return + print(f"Mnēmē doctor — store: {scan['store']}") + print(f" Narrative files: {len(scan['narrative_files'])}") + print(f" SHA-256 (current):{len(scan['sha256_files'])}") + print(f" Legacy MD5: {len(scan['legacy_md5_files'])}") + print(f" Orphan: {len(scan['orphan_files'])}") + print(f" Unknown stems: {len(scan['unknown_files'])}") + if 
scan["legacy_md5_files"]: + print() + print("Legacy MD5-named narratives detected. Run:") + print(" perseus memory doctor --migrate") + print("to rename them to their SHA-256 paths in place. Operation is") + print("idempotent and uses atomic os.replace.") + if scan["orphan_files"]: + print() + print("⚠ Orphan files (frontmatter workspace doesn't match filename):") + for fp in scan["orphan_files"]: + print(f" - {fp}") + print("These were likely written under a different store, OR the") + print("workspace path moved. Review manually before deleting.") + if scan["unknown_files"]: + print() + print("Files with non-standard names (skipped by Mnēmē):") + for fp in scan["unknown_files"]: + print(f" - {fp}") + def _memory_federation_diagnostic(name: str, args_str: str, cfg: dict, workspace: object) -> list[dict]: """Per-directive LSP diagnostic for @memory: warn on unsubscribed federation alias. diff --git a/src/perseus/cli.py b/src/perseus/cli.py index eb620da..881a4e1 100644 --- a/src/perseus/cli.py +++ b/src/perseus/cli.py @@ -195,6 +195,16 @@ def main(): p_fed_pull = fed_sub.add_parser("pull", help="Re-read all subscribed narratives (read-only, manual)") p_fed_pull.add_argument("--json", action="store_true", help="Machine-readable JSON output") + # memory doctor (#128 — legacy MD5 → SHA-256 narrative migration) + p_mem_doc = mem_sub.add_parser( + "doctor", + help="Scan/repair the Mnēmē memory store (legacy MD5 → SHA-256 narrative migration)", + ) + p_mem_doc.add_argument("--migrate", action="store_true", + help="Rename legacy MD5-named narratives to their SHA-256 paths (atomic, idempotent)") + p_mem_doc.add_argument("--json", action="store_true", + help="Machine-readable JSON output") + # memory index (Mnēmē v2) p_mem_idx = mem_sub.add_parser("index", help="Manage the FTS5 search index") idx_sub = p_mem_idx.add_subparsers(dest="index_command", required=True) diff --git a/src/perseus/mneme_narrative.py b/src/perseus/mneme_narrative.py index 82836fe..384ed9a 100644 --- 
a/src/perseus/mneme_narrative.py +++ b/src/perseus/mneme_narrative.py @@ -37,10 +37,154 @@ def _workspace_hash(workspace: Path) -> str: return hashlib.sha256(str(canonical).encode()).hexdigest()[:12] +def _workspace_hash_legacy_md5(workspace: Path) -> str: + """12-char MD5 hex digest — the pre-1.0.3 narrative file name scheme. + + Regression for #128: prior to v1.0.3, Mnēmē derived narrative file names + from an MD5 hash. v1.0.3+ switched to SHA-256. Without an explicit + migration, every existing narrative file on disk was silently orphaned + on upgrade. ``_mneme_path`` calls this function as a one-shot fallback + to locate and rename legacy files. Once migrated, this code path is + never re-entered for that workspace. + + We intentionally use ``usedforsecurity=False`` (Py3.9+) so FIPS-mode + Pythons don't reject the call — this is a file-naming hash, not a + security primitive. We fall back to the no-kwarg call for older Pythons. + """ + canonical = str(workspace.expanduser().resolve()).encode() + try: + return hashlib.md5(canonical, usedforsecurity=False).hexdigest()[:12] + except TypeError: + # Python < 3.9: no `usedforsecurity` kwarg. + return hashlib.md5(canonical).hexdigest()[:12] + + def _mneme_path(workspace: Path, cfg: dict) -> Path: - """Return the per-workspace narrative file path.""" + """Return the per-workspace narrative file path. + + Regression for #128: if a SHA-256 path doesn't exist but a legacy MD5 + path does, transparently rename the legacy file in place. This makes + upgrades from pre-1.0.3 lossless. + + The rename uses ``os.replace`` (atomic on POSIX/NTFS) and is best-effort: + if rename fails (cross-device, permission, etc.), we leave both files in + place and return the SHA-256 path. The caller will then see "no + narrative yet" and recreate — non-fatal but loses prior content. + Operators can also run ``perseus memory doctor --migrate`` to surface + and act on these cases explicitly. 
+ """ + store = Path(cfg.get("memory", {}).get("store", str(PERSEUS_HOME / "memory"))) + new_path = store / f"{_workspace_hash(workspace)}.md" + if new_path.exists(): + return new_path + legacy_path = store / f"{_workspace_hash_legacy_md5(workspace)}.md" + if legacy_path.exists() and legacy_path != new_path: + try: + store.mkdir(parents=True, exist_ok=True) + os.replace(legacy_path, new_path) + except OSError: + # Cross-device / permission denied. Leave the legacy file in + # place so the operator can recover it manually; the caller will + # create a fresh narrative at the new path. + pass + return new_path + + +def _mneme_doctor_scan(cfg: dict) -> dict: + """Scan the memory store and report on narrative file inventory. + + Returns a dict with: + { + "store": str, # path to memory store + "narrative_files": [path, ...], # all *.md in store + "legacy_md5_files": [path, ...], # files whose name matches legacy MD5 of a known workspace + "sha256_files": [path, ...], # files that look like current-scheme files + "orphan_files": [path, ...], # files whose embedded `workspace` frontmatter no longer resolves to their filename + "unknown_files": [path, ...], # files whose stem isn't a 12-char hex hash + } + + "Known workspace" inference: we re-derive the SHA-256 and legacy MD5 + hashes from each file's ``workspace:`` frontmatter field, then match + against the actual filename stem. + + Used by ``perseus memory doctor`` to surface migration candidates. 
+ """ store = Path(cfg.get("memory", {}).get("store", str(PERSEUS_HOME / "memory"))) - return store / f"{_workspace_hash(workspace)}.md" + out: dict = { + "store": str(store), + "narrative_files": [], + "legacy_md5_files": [], + "sha256_files": [], + "orphan_files": [], + "unknown_files": [], + } + if not store.exists(): + return out + hex_re = re.compile(r"^[a-f0-9]{12}$") + for fp in sorted(store.glob("*.md")): + out["narrative_files"].append(str(fp)) + stem = fp.stem + if not hex_re.match(stem): + out["unknown_files"].append(str(fp)) + continue + # Try to read the workspace from frontmatter and classify. + try: + fm, _ = _load_narrative(fp) + except Exception: + out["unknown_files"].append(str(fp)) + continue + ws_raw = str(fm.get("workspace", "")).strip() if isinstance(fm, dict) else "" + if not ws_raw: + # No workspace metadata — can't classify; treat as unknown. + out["unknown_files"].append(str(fp)) + continue + try: + ws = Path(ws_raw).expanduser() + expected_sha = _workspace_hash(ws) + expected_md5 = _workspace_hash_legacy_md5(ws) + except Exception: + out["unknown_files"].append(str(fp)) + continue + if stem == expected_sha: + out["sha256_files"].append(str(fp)) + elif stem == expected_md5: + out["legacy_md5_files"].append(str(fp)) + else: + out["orphan_files"].append(str(fp)) + return out + + +def _mneme_doctor_migrate(cfg: dict) -> dict: + """Rename legacy MD5-named narrative files to their SHA-256 names. + + Returns a dict: + { + "migrated": [(old, new), ...], + "skipped": [(old, new, reason), ...], + "errors": [(old, exc_str), ...], + } + + Idempotent: re-running after a successful migration is a no-op. 
+ """ + report: dict = {"migrated": [], "skipped": [], "errors": []} + scan = _mneme_doctor_scan(cfg) + store = Path(scan["store"]) + for legacy_fp_str in scan["legacy_md5_files"]: + legacy_fp = Path(legacy_fp_str) + try: + fm, _ = _load_narrative(legacy_fp) + ws = Path(str(fm.get("workspace", "")).strip()).expanduser() + new_fp = store / f"{_workspace_hash(ws)}.md" + if new_fp.exists(): + report["skipped"].append( + (str(legacy_fp), str(new_fp), "destination already exists") + ) + continue + os.replace(legacy_fp, new_fp) + report["migrated"].append((str(legacy_fp), str(new_fp))) + except Exception as exc: # pragma: no cover - defensive + report["errors"].append((str(legacy_fp), str(exc))) + return report def _load_narrative(path: Path) -> tuple[dict, str]: diff --git a/tests/test_mneme.py b/tests/test_mneme.py index 7c0b5dc..f6e6a99 100644 --- a/tests/test_mneme.py +++ b/tests/test_mneme.py @@ -185,3 +185,176 @@ def fake_mneme(cfg_, query, k=5, scope=None, type_filter=None): perseus.resolve_memory('mode=search query="x"', cfg(), workspace=tmp_path) assert called + + +# --------------------------------------------------------------------------- +# #128 regression: MD5 → SHA-256 narrative migration +# --------------------------------------------------------------------------- + + +def _legacy_md5_name(workspace: Path) -> str: + """Reproduce the pre-1.0.3 hash exactly for fixture setup.""" + import hashlib as _h + canonical = str(workspace.expanduser().resolve()).encode() + try: + return _h.md5(canonical, usedforsecurity=False).hexdigest()[:12] + except TypeError: + return _h.md5(canonical).hexdigest()[:12] + + +def test_mneme_path_auto_migrates_legacy_md5_file(tmp_path): + """Regression for #128 — opening a workspace with only a legacy MD5 + narrative on disk renames it transparently to the SHA-256 path. 
+ + Without this fix, every pre-1.0.3 user lost their narrative silently + on the v1.0.3 upgrade (the SHA-256 path didn't exist; Mnēmē reported + "No narrative yet" and started over, leaving the MD5 file orphaned). + """ + store = tmp_path / "store" + store.mkdir() + workspace = tmp_path / "ws" + workspace.mkdir() + cfg_ = {"memory": {"store": str(store)}} + + legacy_name = _legacy_md5_name(workspace) + legacy_fp = store / f"{legacy_name}.md" + legacy_fp.write_text( + f"---\nworkspace: {workspace}\nchecksum: legacy-md5\n---\n\n" + "## Project Arc\n\nLegacy content from v1.0.2.\n", + encoding="utf-8", + ) + + # First call should migrate. + new_fp = perseus._mneme_path(workspace, cfg_) + assert new_fp.exists(), "SHA-256 path must exist after migration" + assert not legacy_fp.exists(), "Legacy MD5 file must be renamed away" + body = new_fp.read_text(encoding="utf-8") + assert "Legacy content from v1.0.2." in body, ( + "Migration must preserve narrative content verbatim" + ) + + +def test_mneme_path_no_migration_when_sha256_already_exists(tmp_path): + """If both files exist, prefer SHA-256 and leave the legacy file alone. + + This protects against double-migration races and ensures we never + accidentally overwrite a current-scheme narrative. 
+ """ + store = tmp_path / "store" + store.mkdir() + workspace = tmp_path / "ws" + workspace.mkdir() + cfg_ = {"memory": {"store": str(store)}} + + legacy_name = _legacy_md5_name(workspace) + legacy_fp = store / f"{legacy_name}.md" + legacy_fp.write_text("legacy\n", encoding="utf-8") + + sha_name = perseus._workspace_hash(workspace) + sha_fp = store / f"{sha_name}.md" + sha_fp.write_text("current\n", encoding="utf-8") + + result = perseus._mneme_path(workspace, cfg_) + assert result == sha_fp + assert sha_fp.read_text() == "current\n", "Current file must be untouched" + assert legacy_fp.exists(), "Legacy file must NOT be removed in this case" + + +def test_mneme_path_is_idempotent_after_migration(tmp_path): + """Calling _mneme_path twice in a row after a migration is a no-op.""" + store = tmp_path / "store" + store.mkdir() + workspace = tmp_path / "ws" + workspace.mkdir() + cfg_ = {"memory": {"store": str(store)}} + + legacy_fp = store / f"{_legacy_md5_name(workspace)}.md" + legacy_fp.write_text(f"---\nworkspace: {workspace}\n---\n\ndata\n", encoding="utf-8") + + p1 = perseus._mneme_path(workspace, cfg_) + p2 = perseus._mneme_path(workspace, cfg_) + assert p1 == p2 + assert p1.exists() + assert p1.read_text(encoding="utf-8").endswith("data\n") + + +def test_memory_doctor_scan_classifies_files(tmp_path): + """`memory doctor` (scan-only mode) correctly classifies the store.""" + store = tmp_path / "store" + store.mkdir() + cfg_ = {"memory": {"store": str(store)}} + + ws1 = tmp_path / "ws1"; ws1.mkdir() + ws2 = tmp_path / "ws2"; ws2.mkdir() + + # ws1 has a SHA-256 narrative; ws2 has a legacy MD5 narrative. + (store / f"{perseus._workspace_hash(ws1)}.md").write_text( + f"---\nworkspace: {ws1}\n---\n\nsha file\n", encoding="utf-8" + ) + (store / f"{_legacy_md5_name(ws2)}.md").write_text( + f"---\nworkspace: {ws2}\n---\n\nmd5 file\n", encoding="utf-8" + ) + # A pre-Mnēmē README that should be classified as "unknown stem". 
+ (store / "README.md").write_text("# notes\n", encoding="utf-8") + + scan = perseus._mneme_doctor_scan(cfg_) + assert len(scan["narrative_files"]) == 3 + assert len(scan["sha256_files"]) == 1 + assert len(scan["legacy_md5_files"]) == 1 + assert len(scan["unknown_files"]) == 1 + assert scan["sha256_files"][0].endswith(f"{perseus._workspace_hash(ws1)}.md") + assert scan["legacy_md5_files"][0].endswith(f"{_legacy_md5_name(ws2)}.md") + + +def test_memory_doctor_migrate_renames_legacy_files(tmp_path): + """`memory doctor --migrate` renames every legacy MD5 file in the store.""" + store = tmp_path / "store" + store.mkdir() + cfg_ = {"memory": {"store": str(store)}} + + wsA = tmp_path / "wsA"; wsA.mkdir() + wsB = tmp_path / "wsB"; wsB.mkdir() + legacyA = store / f"{_legacy_md5_name(wsA)}.md" + legacyB = store / f"{_legacy_md5_name(wsB)}.md" + legacyA.write_text(f"---\nworkspace: {wsA}\n---\n\nA content\n", encoding="utf-8") + legacyB.write_text(f"---\nworkspace: {wsB}\n---\n\nB content\n", encoding="utf-8") + + result = perseus._mneme_doctor_migrate(cfg_) + assert len(result["migrated"]) == 2 + assert not legacyA.exists() + assert not legacyB.exists() + + new_A = store / f"{perseus._workspace_hash(wsA)}.md" + new_B = store / f"{perseus._workspace_hash(wsB)}.md" + assert new_A.exists() and new_A.read_text().endswith("A content\n") + assert new_B.exists() and new_B.read_text().endswith("B content\n") + + # Idempotent: re-running is a no-op. 
+ second = perseus._mneme_doctor_migrate(cfg_) + assert second == {"migrated": [], "skipped": [], "errors": []} + + +def test_memory_doctor_migrate_skips_when_destination_exists(tmp_path): + """If a SHA-256 file is already there, --migrate skips the legacy file.""" + store = tmp_path / "store" + store.mkdir() + cfg_ = {"memory": {"store": str(store)}} + + workspace = tmp_path / "ws" + workspace.mkdir() + legacy_fp = store / f"{_legacy_md5_name(workspace)}.md" + legacy_fp.write_text(f"---\nworkspace: {workspace}\n---\n\nlegacy\n", + encoding="utf-8") + sha_fp = store / f"{perseus._workspace_hash(workspace)}.md" + sha_fp.write_text(f"---\nworkspace: {workspace}\n---\n\ncurrent\n", + encoding="utf-8") + + result = perseus._mneme_doctor_migrate(cfg_) + assert result["migrated"] == [] + assert len(result["skipped"]) == 1 + old, new, reason = result["skipped"][0] + assert "exists" in reason + # Both files still present. + assert legacy_fp.exists() + assert sha_fp.exists() + assert sha_fp.read_text().endswith("current\n")