diff --git a/.devague/current b/.devague/current index 63d0030..2d89266 100644 --- a/.devague/current +++ b/.devague/current @@ -1 +1 @@ -data-refinery-now-owns-store-file-migration-a-cons +data-refinery-s-files-backend-can-write-a-fail-clo diff --git a/.devague/current_plan b/.devague/current_plan index d3365f0..2d89266 100644 --- a/.devague/current_plan +++ b/.devague/current_plan @@ -1 +1 @@ -data-refinery-cli-ships-the-storage-data-quality-i +data-refinery-s-files-backend-can-write-a-fail-clo diff --git a/.devague/frames/data-refinery-s-files-backend-can-write-a-fail-clo.json b/.devague/frames/data-refinery-s-files-backend-can-write-a-fail-clo.json new file mode 100644 index 0000000..f163e9e --- /dev/null +++ b/.devague/frames/data-refinery-s-files-backend-can-write-a-fail-clo.json @@ -0,0 +1,192 @@ +{ + "slug": "data-refinery-s-files-backend-can-write-a-fail-clo", + "title": "data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself", + "schema_version": 1, + "status": "exported", + "created": "2026-06-24T12:45:59Z", + "updated": "2026-06-24T13:01:34Z", + "claims": [ + { + "id": "c1", + "kind": "announcement", + "text": "data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself", + "origin": "user", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h6", + "text": "a files store dir opted in to write_gitignore ends up with private shards untracked by git and public shards tracked, and the consumer supplied only a bool + a base_dir it owns (no write path)", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c2", + "kind": "audience", + "text": "eidetic-cli (the first consumer, moving to repo-contained memory) and the assisting agent; more generally any 
files-backend consumer that wants repo-contained private shards", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h7", + "text": "eidetic-cli is a real, named first consumer whose repo-contained-memory cutover (its 2026-06-24 spec) is blocked on this issue, and the surface is generic enough that any other files-backend consumer could opt in identically", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c3", + "kind": "after_state", + "text": "a files store dir is materialized with a fail-closed .gitignore that ignores everything but public shards, so private shards (__private.jsonl) are git-ignored from their first write; the consumer opts in with a single flag and never builds a write path", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h8", + "text": "the consumer reaches the materialized .gitignore by passing only write_gitignore=True + a base_dir it already owns; a test drives store.put/migrate and asserts the file exists without the caller building any path", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c4", + "kind": "before_state", + "text": "a consumer that wants private shards out of git must construct and write a .gitignore itself, reintroducing exactly the pythonsecurity:S2083 write-path sink that #8 removed by moving path-construction to DR", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h9", + "text": "without this endpoint the consumer's own .gitignore write is a flagged pythonsecurity:S2083 path sink (eidetic's prior BLOCKER that #8 and this issue move to DR)", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c5", + "kind": "why_it_matters", + "text": "DR owns the __.jsonl on-disk layout, so DR must own the ignore pattern that tracks it; a whitelist (fail-closed) excludes any future private filename or sidecar DR 
introduces by default rather than silently leaking it", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h10", + "text": "because the whitelist allows only *__public.jsonl, an arbitrary non-public sidecar name DR might add later is git-ignored by default \u2014 verifiable with git check-ignore on a made-up sidecar filename", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c6", + "kind": "boundary", + "text": "files backend only (mongo/neo4j have no on-disk dir -> no-op); never write on a read (list()/get() must not create files); never clobber an existing .gitignore; default OFF so existing dirs stay byte-identical", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h11", + "text": "mongo/neo4j is a no-op (no .gitignore), a read get()/list() creates nothing, and an existing .gitignore is never rewritten \u2014 each is a distinct passing test", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c7", + "kind": "success_signal", + "text": "in an opted-in dir, git check-ignore reports __private.jsonl ignored and __public.jsonl tracked; re-materializing writes nothing (idempotent); option OFF is byte-identical to today; shipped in a tagged release eidetic can pin a floor to", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h12", + "text": "the acceptance trio (check-ignore private-ignored & public-tracked; idempotent re-run writes nothing; OFF is byte-identical) are all expressible as passing tests, and the change ships under a bumped version + CHANGELOG entry", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c8", + "kind": "requirement", + "text": "expose an opt-in write_gitignore flag (default False) on FilesBackend init, plumbed through the store surface eidetic consumes so the consumer passes only a bool and a base_dir it already owns", + "origin": 
"llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h1", + "text": "with the flag OFF (the default), a materialized store dir is byte-for-byte identical to today: no .gitignore, no extra files, no behavior change on any existing consumer or dir", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c9", + "kind": "requirement", + "text": "when on, ensure base_dir/.gitignore holds the fail-closed whitelist exactly: a line '*', then '!.gitignore', then '!*__public.jsonl' \u2014 created only on a write/materialize, never on a read", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h2", + "text": "in a real git repo, git check-ignore confirms __private.jsonl is ignored AND __public.jsonl is tracked under an opted-in base_dir", + "status": "confirmed" + }, + { + "id": "h3", + "text": "a read-only get()/list() (and a dry-run migrate) never creates the .gitignore; only an actual write/materialize does", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c10", + "kind": "decision", + "text": "create-when-absent only: if any .gitignore already exists, do nothing (no rewrite, no clobber) \u2014 it may carry user edits; idempotency is existence-based, not content-match", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h4", + "text": "re-materializing when a .gitignore already exists writes nothing and never overwrites it, even if its content differs from the canonical whitelist", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c11", + "kind": "assumption", + "text": "eidetic consumes write_gitignore via the importable store surface (store.migrate and/or store.put with base_dir + write_gitignore), which requires fixing files.build to stop dropping kwargs; no new CLI flag is needed for v1", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h5", + 
"text": "eidetic can reach write_gitignore through the importable store surface it already uses (store.migrate / store.put), so it never constructs a filesystem write path \u2014 confirming this requires checking eidetic's actual consumption call", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + } + ], + "open_vagueness": [] +} diff --git a/.devague/plans/data-refinery-s-files-backend-can-write-a-fail-clo.json b/.devague/plans/data-refinery-s-files-backend-can-write-a-fail-clo.json new file mode 100644 index 0000000..610ba65 --- /dev/null +++ b/.devague/plans/data-refinery-s-files-backend-can-write-a-fail-clo.json @@ -0,0 +1,206 @@ +{ + "slug": "data-refinery-s-files-backend-can-write-a-fail-clo", + "title": "data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself", + "frame_slug": "data-refinery-s-files-backend-can-write-a-fail-clo", + "schema_version": 1, + "status": "exported", + "created": "2026-06-24T14:07:57Z", + "updated": "2026-06-24T14:10:17Z", + "targets": [ + { + "id": "c1", + "kind": "announcement", + "text": "data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself" + }, + { + "id": "h6", + "kind": "honesty", + "text": "a files store dir opted in to write_gitignore ends up with private shards untracked by git and public shards tracked, and the consumer supplied only a bool + a base_dir it owns (no write path)" + }, + { + "id": "c2", + "kind": "audience", + "text": "eidetic-cli (the first consumer, moving to repo-contained memory) and the assisting agent; more generally any files-backend consumer that wants repo-contained private shards" + }, + { + "id": "h7", + "kind": "honesty", + "text": "eidetic-cli is a real, named first consumer whose repo-contained-memory cutover 
(its 2026-06-24 spec) is blocked on this issue, and the surface is generic enough that any other files-backend consumer could opt in identically" + }, + { + "id": "c3", + "kind": "after_state", + "text": "a files store dir is materialized with a fail-closed .gitignore that ignores everything but public shards, so private shards (__private.jsonl) are git-ignored from their first write; the consumer opts in with a single flag and never builds a write path" + }, + { + "id": "h8", + "kind": "honesty", + "text": "the consumer reaches the materialized .gitignore by passing only write_gitignore=True + a base_dir it already owns; a test drives store.put/migrate and asserts the file exists without the caller building any path" + }, + { + "id": "c4", + "kind": "before_state", + "text": "a consumer that wants private shards out of git must construct and write a .gitignore itself, reintroducing exactly the pythonsecurity:S2083 write-path sink that #8 removed by moving path-construction to DR" + }, + { + "id": "h9", + "kind": "honesty", + "text": "without this endpoint the consumer's own .gitignore write is a flagged pythonsecurity:S2083 path sink (eidetic's prior BLOCKER that #8 and this issue move to DR)" + }, + { + "id": "c5", + "kind": "why_it_matters", + "text": "DR owns the __.jsonl on-disk layout, so DR must own the ignore pattern that tracks it; a whitelist (fail-closed) excludes any future private filename or sidecar DR introduces by default rather than silently leaking it" + }, + { + "id": "h10", + "kind": "honesty", + "text": "because the whitelist allows only *__public.jsonl, an arbitrary non-public sidecar name DR might add later is git-ignored by default \u2014 verifiable with git check-ignore on a made-up sidecar filename" + }, + { + "id": "c6", + "kind": "boundary", + "text": "files backend only (mongo/neo4j have no on-disk dir -> no-op); never write on a read (list()/get() must not create files); never clobber an existing .gitignore; default OFF so existing 
dirs stay byte-identical" + }, + { + "id": "h11", + "kind": "honesty", + "text": "mongo/neo4j is a no-op (no .gitignore), a read get()/list() creates nothing, and an existing .gitignore is never rewritten \u2014 each is a distinct passing test" + }, + { + "id": "c7", + "kind": "success_signal", + "text": "in an opted-in dir, git check-ignore reports __private.jsonl ignored and __public.jsonl tracked; re-materializing writes nothing (idempotent); option OFF is byte-identical to today; shipped in a tagged release eidetic can pin a floor to" + }, + { + "id": "h12", + "kind": "honesty", + "text": "the acceptance trio (check-ignore private-ignored & public-tracked; idempotent re-run writes nothing; OFF is byte-identical) are all expressible as passing tests, and the change ships under a bumped version + CHANGELOG entry" + }, + { + "id": "c8", + "kind": "requirement", + "text": "expose an opt-in write_gitignore flag (default False) on FilesBackend init, plumbed through the store surface eidetic consumes so the consumer passes only a bool and a base_dir it already owns" + }, + { + "id": "h1", + "kind": "honesty", + "text": "with the flag OFF (the default), a materialized store dir is byte-for-byte identical to today: no .gitignore, no extra files, no behavior change on any existing consumer or dir" + }, + { + "id": "c9", + "kind": "requirement", + "text": "when on, ensure base_dir/.gitignore holds the fail-closed whitelist exactly: a line '*', then '!.gitignore', then '!*__public.jsonl' \u2014 created only on a write/materialize, never on a read" + }, + { + "id": "h2", + "kind": "honesty", + "text": "in a real git repo, git check-ignore confirms __private.jsonl is ignored AND __public.jsonl is tracked under an opted-in base_dir" + }, + { + "id": "h3", + "kind": "honesty", + "text": "a read-only get()/list() (and a dry-run migrate) never creates the .gitignore; only an actual write/materialize does" + } + ], + "tasks": [ + { + "id": "t1", + "summary": "Core files-backend 
.gitignore support + unit/integration tests", + "origin": "llm", + "status": "confirmed", + "acceptance_criteria": [ + "FilesBackend(base_dir, write_gitignore=True) creates base_dir/.gitignore on the first upsert with exactly the bytes '*\\n!.gitignore\\n!*__public.jsonl\\n'", + "default write_gitignore=False writes no .gitignore; the materialized dir is byte-identical to current behavior (regression test)", + "get()/list() never create .gitignore even when write_gitignore=True (gitignore lives on write paths only, never in __init__)", + "an existing .gitignore is never overwritten even when its content differs from the whitelist (create-when-absent)", + "in a temp git repo: git check-ignore reports __private.jsonl and an arbitrary non-public sidecar name ignored, and __public.jsonl tracked", + "files.build(base_dir=..., write_gitignore=...) honors both kwargs (no longer dropped); store.put/get/list forward them through get_backend", + "mongo/neo4j backends remain unaffected (no .gitignore behavior); a re-run after the file exists writes nothing" + ], + "deps": [], + "covers": [ + "c1", + "c5", + "c6", + "c8", + "c9", + "h1", + "h2", + "h3", + "h6", + "h8", + "h10", + "h11", + "h12" + ] + }, + { + "id": "t2", + "summary": "Plumb write_gitignore + base_dir through store.migrate", + "origin": "llm", + "status": "confirmed", + "acceptance_criteria": [ + "store.migrate(transform, backend='files', base_dir=..., write_gitignore=True) materializes base_dir/.gitignore during the apply pass", + "dry_run=True writes nothing, including no .gitignore", + "migrate() signature gains write_gitignore: bool = False; with it off, migrate is byte-identical to today" + ], + "deps": [ + "t1" + ], + "covers": [ + "c3" + ] + }, + { + "id": "t3", + "summary": "Docs + version bump + CHANGELOG for the opt-in surface", + "origin": "llm", + "status": "confirmed", + "acceptance_criteria": [ + "docs/contract.md documents write_gitignore on the files put/migrate surface: the fail-closed 
whitelist, create-when-absent, and the mongo/neo4j no-op", + "README.md + AGENTS.colleague.md note the opt-in; CHANGELOG.md gains an Added entry; pyproject.toml version is bumped so version-check passes", + "the rationale (DR owns the layout so DR owns the ignore pattern; moves eidetic's S2083 sink) is captured in the contract doc" + ], + "deps": [ + "t1", + "t2" + ], + "covers": [ + "c4", + "c7", + "h9", + "h12" + ] + }, + { + "id": "t4", + "summary": "Cross-check eidetic-cli can reach write_gitignore via the importable surface", + "origin": "llm", + "status": "confirmed", + "acceptance_criteria": [ + "eidetic-cli's store consumption call sites are inspected and confirmed able to pass write_gitignore via store.migrate/store.put with a base_dir it owns (Option B); if not, a follow-up issue is filed naming the surface eidetic needs", + "the tagged-release floor eidetic will pin is identified (the version this ships in)" + ], + "deps": [], + "covers": [ + "c2", + "h7" + ] + } + ], + "risks": [ + { + "id": "r1", + "text": "git check-ignore acceptance tests require a git binary; the test must skip gracefully when git is absent rather than fail", + "kind": "unknown_nonblocking", + "task_id": "t1" + }, + { + "id": "r2", + "text": "t4 inspects sibling repo eidetic-cli, which may not be checked out locally; if absent, cross-check defers to a brief/issue on eidetic-cli rather than blocking the release", + "kind": "unknown_nonblocking", + "task_id": "t4" + } + ] +} diff --git a/AGENTS.colleague.md b/AGENTS.colleague.md index 3b0dbf4..c8cf5f5 100644 --- a/AGENTS.colleague.md +++ b/AGENTS.colleague.md @@ -16,7 +16,9 @@ behavior, update both. 
data-refinery-cli owns the **storage + data-quality infrastructure layer** split out of eidetic-cli (issue #1): the mongo + neo4j substrate, the docker stack (published to GHCR), a storage-neutral **store** (`store put/get/list` over a -files/mongo/neo4j `Backend`, also importable as `data_refinery.store`), and a +files/mongo/neo4j `Backend`, also importable as `data_refinery.store`; the files +backend accepts an opt-in `write_gitignore=True` to write a fail-closed +`.gitignore`, files-only, default off), and a **consumer-agnostic** data-quality surface (`validate`, `dedup`, `integrity`, `freshness`). It treats stored data as **opaque envelopes** (`{id, hash, content, scope, metadata}`) and never interprets them as "memories" diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ceace1..d1f2fe0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. Format follows [Keep a Changelog](https://keepachangelog.com/). This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.9.0] - 2026-06-24 + +### Added + +- Files store backend: optional `write_gitignore` flag (default off) that writes a fail-closed `.gitignore` (`*` / `!.gitignore` / `!*__public.jsonl`) into the store `base_dir` on materialization, so a consumer keeps private shards out of git without ever constructing a filesystem write path (issue #12). Reachable via `FilesBackend(base_dir, write_gitignore=True)`, `store.put/get/list(..., backend="files", base_dir=..., write_gitignore=True)`, and `store.migrate(..., write_gitignore=True)`. Written only on a write/materialize (never on a read or a dry-run migrate), create-when-absent (never clobbers an existing `.gitignore`), files backend only (mongo/neo4j are a no-op). 
+ +### Changed + +- `data_refinery.store.backends.files.build()` now honors `base_dir` and `write_gitignore` kwargs (it previously dropped all kwargs), so the importable `store.put`/`get`/`list` surface can target a caller-owned `base_dir`. + ## [0.8.0] - 2026-06-24 ### Added diff --git a/README.md b/README.md index 91f0fab..abc23ea 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,10 @@ store.list() # -> list[Envelope] # Upgrade a populated legacy store to the current Envelope format — the consumer # supplies only a transform, never a filesystem write path (data-refinery owns it): store.migrate(record_to_envelope, base_dir="/path/to/store") + +# Opt-in: write a fail-closed .gitignore so private shards stay out of git +# (files backend only; default off, byte-identical when omitted): +store.put(env, backend="files", base_dir="/path/to/store", write_gitignore=True) ``` ## CLI diff --git a/data_refinery/store/backends/files.py b/data_refinery/store/backends/files.py index 63b4b27..c97206e 100644 --- a/data_refinery/store/backends/files.py +++ b/data_refinery/store/backends/files.py @@ -20,6 +20,10 @@ _ENV_DIR = "DR_DATA_DIR" _JSONL_GLOB = "*.jsonl" # one scope file per (name, visibility) _TMP_SUFFIX = ".tmp" # atomic-write temp sibling: ".jsonl.tmp" +_GITIGNORE_NAME = ".gitignore" +# Fail-closed whitelist (issue #12): ignore everything but public shards, so any +# future private filename or sidecar is excluded by default rather than leaked. +_GITIGNORE_BODY = "*\n!.gitignore\n!*__public.jsonl\n" # Re-derived from the public `Visibility` type so it never drifts from it. _VISIBILITIES: tuple[str, ...] 
= get_args(Visibility) @@ -32,16 +36,34 @@ class FilesBackend: """Persist envelopes as JSONL files, one file per scope.""" - def __init__(self, base_dir: str | None = None) -> None: + def __init__(self, base_dir: str | None = None, *, write_gitignore: bool = False) -> None: if base_dir is None: base_dir = os.environ.get(_ENV_DIR) or str(Path.home() / ".data-refinery" / "store") self._base = Path(base_dir) self._base.mkdir(parents=True, exist_ok=True) + self._write_gitignore = write_gitignore # -- Backend protocol ----------------------------------------------- + def _ensure_gitignore(self) -> None: + """Create ``.gitignore`` in *base_dir* when ``write_gitignore`` is set. + + Create-when-absent only (never overwrites user edits). Reuses the shared + :meth:`_atomic_write` (temp sibling + ``os.replace``), so a write fault + surfaces as a structured ``CliError`` — never a traceback — and a crash + leaves either no file or the complete whitelist (the orphan temp is + reaped by :meth:`_reap_orphan_tmp`). + """ + if not self._write_gitignore: + return + gi = self._base / _GITIGNORE_NAME + if gi.exists(): + return + self._atomic_write(gi, _GITIGNORE_BODY) + def upsert(self, envelope: Envelope) -> None: """Insert or replace *envelope* idempotently (by id; dedup by hash on insert).""" + self._ensure_gitignore() path = self._scope_file(envelope.scope) records = self._load(path) @@ -141,6 +163,7 @@ def migrate( # per file (temp sibling + os.replace), so a crash here still leaves each # file either fully old or fully new and the run is safe to resume. if not dry_run: + self._ensure_gitignore() for path, new_text in plan: self._atomic_write(path, new_text) return { @@ -215,13 +238,19 @@ def _assert_contained(path: Path, root: Path) -> None: @staticmethod def _reap_orphan_tmp(root: Path) -> None: - """Remove ``*.jsonl.tmp`` left by a prior interrupted rewrite. + """Remove ``*.jsonl.tmp`` / ``.gitignore.tmp`` left by an interrupted write. 
``os.replace`` consumes the temp on success, so a surviving temp is the residue of a crash *before* the swap — the real file is intact. Reaping - keeps the store dir tidy and the ``*.jsonl`` glob unambiguous. + keeps the store dir tidy and the ``*.jsonl`` glob unambiguous. The + ``.gitignore`` temp shares the same atomic-write path, so it is reaped + here too (it falls outside the ``*.jsonl.tmp`` glob). """ - for tmp in root.glob(_JSONL_GLOB + _TMP_SUFFIX): + temps = list(root.glob(_JSONL_GLOB + _TMP_SUFFIX)) + gi_tmp = root / (_GITIGNORE_NAME + _TMP_SUFFIX) + if gi_tmp.exists(): + temps.append(gi_tmp) + for tmp in temps: try: tmp.unlink() except OSError: # pragma: no cover - best effort @@ -345,6 +374,8 @@ def _to_envelope(obj: dict[str, Any], transform: Transform | None) -> Envelope | return transform(obj) -def build(**_kwargs: object) -> Backend: - """Factory: a default FilesBackend (ignores kwargs like ``timeout_ms``).""" - return FilesBackend() +def build( + *, base_dir: str | None = None, write_gitignore: bool = False, **_kwargs: object +) -> Backend: + """Factory: a FilesBackend honouring ``base_dir`` and ``write_gitignore``.""" + return FilesBackend(base_dir, write_gitignore=write_gitignore) diff --git a/data_refinery/store/migrate.py b/data_refinery/store/migrate.py index 36929cf..3074d6d 100644 --- a/data_refinery/store/migrate.py +++ b/data_refinery/store/migrate.py @@ -28,6 +28,7 @@ def migrate( *, backend: str = DEFAULT_BACKEND, base_dir: str | None = None, + write_gitignore: bool = False, dry_run: bool = False, ) -> dict[str, Any]: """Upgrade an on-disk store to the current Envelope format. @@ -39,6 +40,11 @@ def migrate( optionally the store root it already owns via *base_dir*) — never a per-file write path. + With ``write_gitignore=True`` the files backend materialises the fail-closed + ``.gitignore`` (``* / !.gitignore / !*__public.jsonl``) during the apply pass; + a dry_run never writes it; default ``False`` is byte-identical to today. 
+ Files backend only. + Idempotent: a second run rewrites nothing. The consumer's transform need **not** itself be idempotent — after the first run every line is a canonical Envelope, and the files backend keeps an already-canonical line **verbatim** @@ -55,7 +61,9 @@ def migrate( structured :class:`CliError`. """ if backend == "files": - return FilesBackend(base_dir).migrate(transform, dry_run=dry_run) + return FilesBackend(base_dir, write_gitignore=write_gitignore).migrate( + transform, dry_run=dry_run + ) raise CliError( code=EXIT_USER_ERROR, message=f"store migration is not yet supported for backend {backend!r}", diff --git a/docs/contract.md b/docs/contract.md index 187b417..4f8f4ed 100644 --- a/docs/contract.md +++ b/docs/contract.md @@ -150,7 +150,7 @@ summary = migrate(record_to_envelope, base_dir="/path/to/store") # -> dict # record_to_envelope: Callable[[dict], Envelope | None] (None drops a record) ``` -`migrate(transform=None, *, backend="files", base_dir=None, dry_run=False)` +`migrate(transform=None, *, backend="files", base_dir=None, write_gitignore=False, dry_run=False)` returns `{backend, files, migrated, migrated_files, skipped, dry_run}`. With `transform=None` it re-canonicalises data-refinery's own Envelope-JSONL (the self-heal path the `store migrate` CLI verb uses). The consumer supplies a @@ -171,6 +171,42 @@ write path. - **Files granularity only** today — `mongo` (vectors) / `neo4j` (graph) migration are a later granularity and exit `1` with a `hint:`. +### Fail-closed `.gitignore` opt-in (stable) + +When a consumer opts in with `write_gitignore=True`, the files backend ensures a +fail-closed `.gitignore` exists in the store `base_dir` with exactly this +content: + +```gitignore +* +!.gitignore +!*__public.jsonl +``` + +It ignores everything and only ever allows public shards (and the `.gitignore` +itself) to be tracked, so any future private filename or sidecar is excluded by +default. 
+ +**Behavior:** + +- **Opt-in** — default `False`; off is byte-identical to today. +- **Materialise on write, never on read** — written only during an upsert or a + non-dry `store migrate` apply; never on `get`/`list` or a dry-run. +- **Create-when-absent only** — an existing `.gitignore` is never overwritten. +- **Files backend only** — `mongo`/`neo4j` are a no-op. + +**Reachable surfaces:** + +- `FilesBackend(base_dir, write_gitignore=True)` +- `data_refinery.store.put/get/list(..., backend="files", base_dir=..., write_gitignore=True)` + (`get_backend` forwards kwargs to the files `build()`) +- `data_refinery.store.migrate(transform, *, backend="files", base_dir=..., write_gitignore=..., dry_run=...)` + +**Rationale:** data-refinery owns the `<name>__<visibility>.jsonl` on-disk +layout, so it owns the ignore pattern that tracks it; this keeps the `.gitignore` +write-path sink (the consumer's prior pythonsecurity:S2083) on the storage +owner. Continues issues #8 / #1. + ## Versioning policy | Change | Requires | diff --git a/docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md b/docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md new file mode 100644 index 0000000..f5f2a90 --- /dev/null +++ b/docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md @@ -0,0 +1,49 @@ +# Build Plan — data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself + +slug: `data-refinery-s-files-backend-can-write-a-fail-clo` · status: `exported` · from frame: `data-refinery-s-files-backend-can-write-a-fail-clo` + +> data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself + +## Tasks + +### t1 — Core files-backend .gitignore support + unit/integration tests + +- covers: c1, c5, c6, c8, 
c9, h1, h2, h3, h6, h8, h10, h11, h12 +- acceptance: + - FilesBackend(base_dir, write_gitignore=True) creates base_dir/.gitignore on the first upsert with exactly the bytes '*\n!.gitignore\n!*__public.jsonl\n' + - default write_gitignore=False writes no .gitignore; the materialized dir is byte-identical to current behavior (regression test) + - get()/list() never create .gitignore even when write_gitignore=True (gitignore lives on write paths only, never in __init__) + - an existing .gitignore is never overwritten even when its content differs from the whitelist (create-when-absent) + - in a temp git repo: git check-ignore reports `__private.jsonl` and an arbitrary non-public sidecar name ignored, and `__public.jsonl` tracked + - files.build(base_dir=..., write_gitignore=...) honors both kwargs (no longer dropped); store.put/get/list forward them through get_backend + - mongo/neo4j backends remain unaffected (no .gitignore behavior); a re-run after the file exists writes nothing + +### t2 — Plumb write_gitignore + base_dir through store.migrate + +- depends on: t1 +- covers: c3 +- acceptance: + - store.migrate(transform, backend='files', base_dir=..., write_gitignore=True) materializes base_dir/.gitignore during the apply pass + - dry_run=True writes nothing, including no .gitignore + - migrate() signature gains write_gitignore: bool = False; with it off, migrate is byte-identical to today + +### t3 — Docs + version bump + CHANGELOG for the opt-in surface + +- depends on: t1, t2 +- covers: c4, c7, h9, h12 +- acceptance: + - docs/contract.md documents write_gitignore on the files put/migrate surface: the fail-closed whitelist, create-when-absent, and the mongo/neo4j no-op + - README.md + AGENTS.colleague.md note the opt-in; CHANGELOG.md gains an Added entry; pyproject.toml version is bumped so version-check passes + - the rationale (DR owns the layout so DR owns the ignore pattern; moves eidetic's S2083 sink) is captured in the contract doc + +### t4 — Cross-check 
eidetic-cli can reach write_gitignore via the importable surface + +- covers: c2, h7 +- acceptance: + - eidetic-cli's store consumption call sites are inspected and confirmed able to pass write_gitignore via store.migrate/store.put with a base_dir it owns (Option B); if not, a follow-up issue is filed naming the surface eidetic needs + - the tagged-release floor eidetic will pin is identified (the version this ships in) + +## Risks + +- [unknown_nonblocking] git check-ignore acceptance tests require a git binary; the test must skip gracefully when git is absent rather than fail (task t1) +- [unknown_nonblocking] t4 inspects sibling repo eidetic-cli, which may not be checked out locally; if absent, cross-check defers to a brief/issue on eidetic-cli rather than blocking the release (task t4) diff --git a/docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md b/docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md new file mode 100644 index 0000000..39aa5b0 --- /dev/null +++ b/docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md @@ -0,0 +1,52 @@ +# data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself + +> data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself + +## Audience + +- eidetic-cli (the first consumer, moving to repo-contained memory) and the assisting agent; more generally any files-backend consumer that wants repo-contained private shards + +## Before → After + +- Before: a consumer that wants private shards out of git must construct and write a .gitignore itself, reintroducing exactly the pythonsecurity:S2083 write-path sink that #8 removed by moving path-construction to DR +- After: a files store dir is materialized with a 
fail-closed .gitignore that ignores everything but public shards, so private shards (`__private.jsonl`) are git-ignored from their first write; the consumer opts in with a single flag and never builds a write path + +## Why it matters + +- DR owns the `{app}__{visibility}.jsonl` on-disk layout, so DR must own the ignore pattern that tracks it; a whitelist (fail-closed) excludes any future private filename or sidecar DR introduces by default rather than silently leaking it + +## Requirements + +- expose an opt-in write_gitignore flag (default False) on FilesBackend init, plumbed through the store surface eidetic consumes so the consumer passes only a bool and a base_dir it already owns + - honesty: with the flag OFF (the default), a materialized store dir is byte-for-byte identical to today: no .gitignore, no extra files, no behavior change on any existing consumer or dir +- when on, ensure base_dir/.gitignore holds the fail-closed whitelist exactly: a line '*', then '!.gitignore', then '!*__public.jsonl' — created only on a write/materialize, never on a read + - honesty: in a real git repo, git check-ignore confirms `__private.jsonl` is ignored AND `__public.jsonl` is tracked under an opted-in base_dir + - honesty: a read-only get()/list() (and a dry-run migrate) never creates the .gitignore; only an actual write/materialize does + +## Honesty conditions + +- a files store dir opted in to write_gitignore ends up with private shards untracked by git and public shards tracked, and the consumer supplied only a bool + a base_dir it owns (no write path) +- eidetic-cli is a real, named first consumer whose repo-contained-memory cutover (its 2026-06-24 spec) is blocked on this issue, and the surface is generic enough that any other files-backend consumer could opt in identically +- the consumer reaches the materialized .gitignore by passing only write_gitignore=True + a base_dir it already owns; a test drives store.put/migrate and asserts the file exists without the caller building any 
path +- without this endpoint the consumer's own .gitignore write is a flagged pythonsecurity:S2083 path sink (eidetic's prior BLOCKER that #8 and this issue move to DR) +- because the whitelist allows only `*__public.jsonl`, an arbitrary non-public sidecar name DR might add later is git-ignored by default — verifiable with git check-ignore on a made-up sidecar filename +- mongo/neo4j is a no-op (no .gitignore), a read get()/list() creates nothing, and an existing .gitignore is never rewritten — each is a distinct passing test +- the acceptance trio (check-ignore private-ignored & public-tracked; idempotent re-run writes nothing; OFF is byte-identical) are all expressible as passing tests, and the change ships under a bumped version + CHANGELOG entry +- re-materializing when a .gitignore already exists writes nothing and never overwrites it, even if its content differs from the canonical whitelist +- eidetic can reach write_gitignore through the importable store surface it already uses (store.migrate / store.put), so it never constructs a filesystem write path — confirming this requires checking eidetic's actual consumption call sites + +## Success signals + +- in an opted-in dir, git check-ignore reports `__private.jsonl` ignored and `__public.jsonl` tracked; re-materializing writes nothing (idempotent); option OFF is byte-identical to today; shipped in a tagged release eidetic can pin a floor to + +## Scope / boundaries + +- files backend only (mongo/neo4j have no on-disk dir -> no-op); never write on a read (list()/get() must not create files); never clobber an existing .gitignore; default OFF so existing dirs stay byte-identical + +## Assumptions + +- eidetic consumes write_gitignore via the importable store surface (store.migrate and/or store.put with base_dir + write_gitignore), which requires fixing files.build to stop dropping kwargs; no new CLI flag is needed for v1 + +## Decisions + +- create-when-absent only: if any .gitignore already exists, do nothing (no 
rewrite, no clobber) — it may carry user edits; idempotency is existence-based, not content-match diff --git a/pyproject.toml b/pyproject.toml index 6ad5d0d..2230c67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data-refinery-cli" -version = "0.8.0" +version = "0.9.0" description = "Agent and CLI for data quality in storage and retrieval — validating, deduplicating, and checking the integrity and freshness of data as it is stored and fetched. Split out of eidetic-cli so eidetic keeps agent-memory; sibling to daria, the Data Refinery Intelligent Agent." readme = "README.md" license = "Apache-2.0" diff --git a/tests/test_store_gitignore.py b/tests/test_store_gitignore.py new file mode 100644 index 0000000..b1e4190 --- /dev/null +++ b/tests/test_store_gitignore.py @@ -0,0 +1,161 @@ +"""Files-backend .gitignore materialization (issue #12). + +When ``write_gitignore=True`` is passed to ``FilesBackend`` (or forwarded +through ``store.put`` / ``store.migrate``), the backend creates a fail-closed +``.gitignore`` on the first write, ignoring everything except public shards. +Reads never create the file; existing files are never overwritten. 
+""" + +from __future__ import annotations + +import shutil +import subprocess + +import pytest + +import data_refinery.store as store +from data_refinery.store.backends.files import FilesBackend, build +from data_refinery.store.envelope import Envelope, Scope + +_GITIGNORE_CONTENT = "*\n!.gitignore\n!*__public.jsonl\n" + + +# ------------------------------------------------------------------ +# Content / existence +# ------------------------------------------------------------------ + + +def test_gitignore_content_after_upsert(tmp_path) -> None: + """A write_gitignore=True upsert creates .gitignore with the canonical content.""" + backend = FilesBackend(base_dir=str(tmp_path), write_gitignore=True) + backend.upsert(Envelope(id="a", content="hello")) + gi = tmp_path / ".gitignore" + assert gi.exists() + assert gi.read_text() == _GITIGNORE_CONTENT + + +def test_default_no_gitignore(tmp_path) -> None: + """Default (write_gitignore=False) never creates .gitignore.""" + backend = FilesBackend(base_dir=str(tmp_path)) + backend.upsert(Envelope(id="a", content="hello")) + assert not (tmp_path / ".gitignore").exists() + + +def test_read_does_not_create_gitignore(tmp_path) -> None: + """get()/list() with write_gitignore=True must NOT create .gitignore.""" + backend = FilesBackend(base_dir=str(tmp_path), write_gitignore=True) + # No envelopes yet — reads on an empty store + assert backend.get("nope", Scope("default", "public")) is None + assert backend.list(Scope("default", "public")) == [] + assert not (tmp_path / ".gitignore").exists() + + +def test_existing_gitignore_never_overwritten(tmp_path) -> None: + """A pre-existing .gitignore with different content is never clobbered.""" + gi = tmp_path / ".gitignore" + gi.write_text("my-custom-rules\n") + backend = FilesBackend(base_dir=str(tmp_path), write_gitignore=True) + backend.upsert(Envelope(id="a", content="hello")) + assert gi.read_text() == "my-custom-rules\n" + + +# 
------------------------------------------------------------------ +# Real git integration +# ------------------------------------------------------------------ + + +@pytest.mark.skipif( + shutil.which("git") is None, + reason="git not installed", +) +def test_git_check_ignore_private_ignored_public_tracked(tmp_path) -> None: + """In a real git repo, private shards are ignored and public shards are tracked.""" + # Initialise a git repo inside tmp_path + subprocess.run(["git", "init"], cwd=tmp_path, check=True, capture_output=True) + subprocess.run( + ["git", "config", "user.email", "test@test.com"], + cwd=tmp_path, + check=True, + capture_output=True, + ) + subprocess.run( + ["git", "config", "user.name", "Test"], + cwd=tmp_path, + check=True, + capture_output=True, + ) + + backend = FilesBackend(base_dir=str(tmp_path), write_gitignore=True) + backend.upsert(Envelope(id="priv", content="secret", scope=Scope("myapp", "private"))) + backend.upsert(Envelope(id="pub", content="hello", scope=Scope("myapp", "public"))) + + private_file = tmp_path / "myapp__private.jsonl" + public_file = tmp_path / "myapp__public.jsonl" + sidecar = tmp_path / "foo__index.bin" + sidecar.write_text("sidecar") + + # Private shard is ignored + result = subprocess.run( + ["git", "check-ignore", "-q", str(private_file)], + cwd=tmp_path, + capture_output=True, + ) + assert result.returncode == 0, "private shard should be ignored" + + # Arbitrary non-public sidecar is ignored + result = subprocess.run( + ["git", "check-ignore", "-q", str(sidecar)], + cwd=tmp_path, + capture_output=True, + ) + assert result.returncode == 0, "non-public sidecar should be ignored" + + # Public shard is NOT ignored + result = subprocess.run( + ["git", "check-ignore", "-q", str(public_file)], + cwd=tmp_path, + capture_output=True, + ) + assert result.returncode != 0, "public shard should NOT be ignored" + + +# ------------------------------------------------------------------ +# Factory / store.put forwarding +# 
------------------------------------------------------------------ + + +def test_build_forwards_write_gitignore(tmp_path) -> None: + """build(base_dir=..., write_gitignore=True) returns a backend that honours the flag.""" + backend = build(base_dir=str(tmp_path), write_gitignore=True) + assert isinstance(backend, FilesBackend) + backend.upsert(Envelope(id="a", content="hello")) + assert (tmp_path / ".gitignore").exists() + + +def test_store_put_forwards_write_gitignore(tmp_path) -> None: + """store.put(..., backend='files', write_gitignore=True) materialises .gitignore.""" + store.put( + Envelope(id="a", content="hello"), + backend="files", + base_dir=str(tmp_path), + write_gitignore=True, + ) + assert (tmp_path / ".gitignore").exists() + assert (tmp_path / ".gitignore").read_text() == _GITIGNORE_CONTENT + + +# ------------------------------------------------------------------ +# Crash hygiene +# ------------------------------------------------------------------ + + +def test_orphan_gitignore_tmp_is_reaped_by_migrate(tmp_path) -> None: + """A stray .gitignore.tmp (a crashed write's debris) is reaped on migrate.""" + backend = FilesBackend(base_dir=str(tmp_path), write_gitignore=True) + backend.upsert(Envelope(id="a", content="hello")) # scope file + .gitignore + stray = tmp_path / ".gitignore.tmp" + stray.write_text("debris") + backend.migrate() # reaps orphan temps before planning + assert not stray.exists() + # The real .gitignore is untouched (create-when-absent on a re-materialise). + assert (tmp_path / ".gitignore").read_text() == _GITIGNORE_CONTENT diff --git a/tests/test_store_migrate.py b/tests/test_store_migrate.py index 0b342f0..4a0a599 100644 --- a/tests/test_store_migrate.py +++ b/tests/test_store_migrate.py @@ -431,3 +431,30 @@ def test_cli_store_migrate_unsupported_backend_exits_1(files_env: str, capsys) - # Text mode: the same error renders the load-bearing `hint:` prefix. 
assert main(["store", "migrate", "--backend", "mongo"]) == 1 assert "hint:" in capsys.readouterr().err + + +# --- write_gitignore through the importable endpoint ------------------------- + + +def test_migrate_write_gitignore_creates_gitignore(tmp_path: Path) -> None: + # Seed a scope file WITHOUT write_gitignore so no .gitignore exists yet. + store.put(Envelope(id="a", content="x"), backend="files", base_dir=str(tmp_path)) + assert not (tmp_path / ".gitignore").exists() + store.migrate(base_dir=str(tmp_path), write_gitignore=True) + assert (tmp_path / ".gitignore").exists() + assert (tmp_path / ".gitignore").read_text( + encoding="utf-8" + ) == "*\n!.gitignore\n!*__public.jsonl\n" + + +def test_migrate_write_gitignore_dry_run_does_not_create(tmp_path: Path) -> None: + store.put(Envelope(id="a", content="x"), backend="files", base_dir=str(tmp_path)) + assert not (tmp_path / ".gitignore").exists() + store.migrate(base_dir=str(tmp_path), write_gitignore=True, dry_run=True) + assert not (tmp_path / ".gitignore").exists() + + +def test_migrate_default_no_write_gitignore(tmp_path: Path) -> None: + store.put(Envelope(id="a", content="x"), backend="files", base_dir=str(tmp_path)) + store.migrate(base_dir=str(tmp_path)) + assert not (tmp_path / ".gitignore").exists() diff --git a/uv.lock b/uv.lock index 14157a2..12b9cb8 100644 --- a/uv.lock +++ b/uv.lock @@ -156,7 +156,7 @@ wheels = [ [[package]] name = "data-refinery-cli" -version = "0.6.0" +version = "0.9.0" source = { editable = "." } [package.optional-dependencies]