From 662cd640d36ef258ec2439b51fdac8563749dfe4 Mon Sep 17 00:00:00 2001 From: Ori Nachum Date: Wed, 24 Jun 2026 17:14:22 +0300 Subject: [PATCH 1/8] spec+plan: fail-closed .gitignore on files-store materialization (issue #12) Converged devague frame + build plan for DR's files backend optionally writing a fail-closed .gitignore on store-dir materialization, so a consumer (eidetic-cli) keeps private shards out of git without constructing a write path. Surface decision (Option B): write_gitignore on FilesBackend init + store.migrate, and fix files.build to honor base_dir+write_gitignore so store.put/get/list flow them. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LAEeF8y7RrKft8de7rZfDM --- .devague/current | 2 +- .devague/current_plan | 2 +- ...-s-files-backend-can-write-a-fail-clo.json | 192 ++++++++++++++++ ...-s-files-backend-can-write-a-fail-clo.json | 206 ++++++++++++++++++ ...ry-s-files-backend-can-write-a-fail-clo.md | 49 +++++ ...ry-s-files-backend-can-write-a-fail-clo.md | 52 +++++ 6 files changed, 501 insertions(+), 2 deletions(-) create mode 100644 .devague/frames/data-refinery-s-files-backend-can-write-a-fail-clo.json create mode 100644 .devague/plans/data-refinery-s-files-backend-can-write-a-fail-clo.json create mode 100644 docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md create mode 100644 docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md diff --git a/.devague/current b/.devague/current index 63d0030..2d89266 100644 --- a/.devague/current +++ b/.devague/current @@ -1 +1 @@ -data-refinery-now-owns-store-file-migration-a-cons +data-refinery-s-files-backend-can-write-a-fail-clo diff --git a/.devague/current_plan b/.devague/current_plan index d3365f0..2d89266 100644 --- a/.devague/current_plan +++ b/.devague/current_plan @@ -1 +1 @@ -data-refinery-cli-ships-the-storage-data-quality-i +data-refinery-s-files-backend-can-write-a-fail-clo diff --git 
a/.devague/frames/data-refinery-s-files-backend-can-write-a-fail-clo.json b/.devague/frames/data-refinery-s-files-backend-can-write-a-fail-clo.json new file mode 100644 index 0000000..f163e9e --- /dev/null +++ b/.devague/frames/data-refinery-s-files-backend-can-write-a-fail-clo.json @@ -0,0 +1,192 @@ +{ + "slug": "data-refinery-s-files-backend-can-write-a-fail-clo", + "title": "data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself", + "schema_version": 1, + "status": "exported", + "created": "2026-06-24T12:45:59Z", + "updated": "2026-06-24T13:01:34Z", + "claims": [ + { + "id": "c1", + "kind": "announcement", + "text": "data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself", + "origin": "user", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h6", + "text": "a files store dir opted in to write_gitignore ends up with private shards untracked by git and public shards tracked, and the consumer supplied only a bool + a base_dir it owns (no write path)", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c2", + "kind": "audience", + "text": "eidetic-cli (the first consumer, moving to repo-contained memory) and the assisting agent; more generally any files-backend consumer that wants repo-contained private shards", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h7", + "text": "eidetic-cli is a real, named first consumer whose repo-contained-memory cutover (its 2026-06-24 spec) is blocked on this issue, and the surface is generic enough that any other files-backend consumer could opt in identically", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c3", + "kind": "after_state", + 
"text": "a files store dir is materialized with a fail-closed .gitignore that ignores everything but public shards, so private shards (__private.jsonl) are git-ignored from their first write; the consumer opts in with a single flag and never builds a write path", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h8", + "text": "the consumer reaches the materialized .gitignore by passing only write_gitignore=True + a base_dir it already owns; a test drives store.put/migrate and asserts the file exists without the caller building any path", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c4", + "kind": "before_state", + "text": "a consumer that wants private shards out of git must construct and write a .gitignore itself, reintroducing exactly the pythonsecurity:S2083 write-path sink that #8 removed by moving path-construction to DR", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h9", + "text": "without this endpoint the consumer's own .gitignore write is a flagged pythonsecurity:S2083 path sink (eidetic's prior BLOCKER that #8 and this issue move to DR)", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c5", + "kind": "why_it_matters", + "text": "DR owns the __.jsonl on-disk layout, so DR must own the ignore pattern that tracks it; a whitelist (fail-closed) excludes any future private filename or sidecar DR introduces by default rather than silently leaking it", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h10", + "text": "because the whitelist allows only *__public.jsonl, an arbitrary non-public sidecar name DR might add later is git-ignored by default \u2014 verifiable with git check-ignore on a made-up sidecar filename", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c6", + "kind": "boundary", + "text": "files backend only (mongo/neo4j 
have no on-disk dir -> no-op); never write on a read (list()/get() must not create files); never clobber an existing .gitignore; default OFF so existing dirs stay byte-identical", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h11", + "text": "mongo/neo4j is a no-op (no .gitignore), a read get()/list() creates nothing, and an existing .gitignore is never rewritten \u2014 each is a distinct passing test", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c7", + "kind": "success_signal", + "text": "in an opted-in dir, git check-ignore reports __private.jsonl ignored and __public.jsonl tracked; re-materializing writes nothing (idempotent); option OFF is byte-identical to today; shipped in a tagged release eidetic can pin a floor to", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h12", + "text": "the acceptance trio (check-ignore private-ignored & public-tracked; idempotent re-run writes nothing; OFF is byte-identical) are all expressible as passing tests, and the change ships under a bumped version + CHANGELOG entry", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c8", + "kind": "requirement", + "text": "expose an opt-in write_gitignore flag (default False) on FilesBackend init, plumbed through the store surface eidetic consumes so the consumer passes only a bool and a base_dir it already owns", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h1", + "text": "with the flag OFF (the default), a materialized store dir is byte-for-byte identical to today: no .gitignore, no extra files, no behavior change on any existing consumer or dir", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c9", + "kind": "requirement", + "text": "when on, ensure base_dir/.gitignore holds the fail-closed whitelist exactly: a line '*', then '!.gitignore', then 
'!*__public.jsonl' \u2014 created only on a write/materialize, never on a read", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h2", + "text": "in a real git repo, git check-ignore confirms __private.jsonl is ignored AND __public.jsonl is tracked under an opted-in base_dir", + "status": "confirmed" + }, + { + "id": "h3", + "text": "a read-only get()/list() (and a dry-run migrate) never creates the .gitignore; only an actual write/materialize does", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c10", + "kind": "decision", + "text": "create-when-absent only: if any .gitignore already exists, do nothing (no rewrite, no clobber) \u2014 it may carry user edits; idempotency is existence-based, not content-match", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h4", + "text": "re-materializing when a .gitignore already exists writes nothing and never overwrites it, even if its content differs from the canonical whitelist", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + }, + { + "id": "c11", + "kind": "assumption", + "text": "eidetic consumes write_gitignore via the importable store surface (store.migrate and/or store.put with base_dir + write_gitignore), which requires fixing files.build to stop dropping kwargs; no new CLI flag is needed for v1", + "origin": "llm", + "status": "confirmed", + "honesty_conditions": [ + { + "id": "h5", + "text": "eidetic can reach write_gitignore through the importable store surface it already uses (store.migrate / store.put), so it never constructs a filesystem write path \u2014 confirming this requires checking eidetic's actual consumption call", + "status": "confirmed" + } + ], + "hard_questions": [], + "links": [] + } + ], + "open_vagueness": [] +} diff --git a/.devague/plans/data-refinery-s-files-backend-can-write-a-fail-clo.json 
b/.devague/plans/data-refinery-s-files-backend-can-write-a-fail-clo.json new file mode 100644 index 0000000..610ba65 --- /dev/null +++ b/.devague/plans/data-refinery-s-files-backend-can-write-a-fail-clo.json @@ -0,0 +1,206 @@ +{ + "slug": "data-refinery-s-files-backend-can-write-a-fail-clo", + "title": "data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself", + "frame_slug": "data-refinery-s-files-backend-can-write-a-fail-clo", + "schema_version": 1, + "status": "exported", + "created": "2026-06-24T14:07:57Z", + "updated": "2026-06-24T14:10:17Z", + "targets": [ + { + "id": "c1", + "kind": "announcement", + "text": "data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself" + }, + { + "id": "h6", + "kind": "honesty", + "text": "a files store dir opted in to write_gitignore ends up with private shards untracked by git and public shards tracked, and the consumer supplied only a bool + a base_dir it owns (no write path)" + }, + { + "id": "c2", + "kind": "audience", + "text": "eidetic-cli (the first consumer, moving to repo-contained memory) and the assisting agent; more generally any files-backend consumer that wants repo-contained private shards" + }, + { + "id": "h7", + "kind": "honesty", + "text": "eidetic-cli is a real, named first consumer whose repo-contained-memory cutover (its 2026-06-24 spec) is blocked on this issue, and the surface is generic enough that any other files-backend consumer could opt in identically" + }, + { + "id": "c3", + "kind": "after_state", + "text": "a files store dir is materialized with a fail-closed .gitignore that ignores everything but public shards, so private shards (__private.jsonl) are git-ignored from their first write; the consumer opts in with a single flag and never 
builds a write path" + }, + { + "id": "h8", + "kind": "honesty", + "text": "the consumer reaches the materialized .gitignore by passing only write_gitignore=True + a base_dir it already owns; a test drives store.put/migrate and asserts the file exists without the caller building any path" + }, + { + "id": "c4", + "kind": "before_state", + "text": "a consumer that wants private shards out of git must construct and write a .gitignore itself, reintroducing exactly the pythonsecurity:S2083 write-path sink that #8 removed by moving path-construction to DR" + }, + { + "id": "h9", + "kind": "honesty", + "text": "without this endpoint the consumer's own .gitignore write is a flagged pythonsecurity:S2083 path sink (eidetic's prior BLOCKER that #8 and this issue move to DR)" + }, + { + "id": "c5", + "kind": "why_it_matters", + "text": "DR owns the __.jsonl on-disk layout, so DR must own the ignore pattern that tracks it; a whitelist (fail-closed) excludes any future private filename or sidecar DR introduces by default rather than silently leaking it" + }, + { + "id": "h10", + "kind": "honesty", + "text": "because the whitelist allows only *__public.jsonl, an arbitrary non-public sidecar name DR might add later is git-ignored by default \u2014 verifiable with git check-ignore on a made-up sidecar filename" + }, + { + "id": "c6", + "kind": "boundary", + "text": "files backend only (mongo/neo4j have no on-disk dir -> no-op); never write on a read (list()/get() must not create files); never clobber an existing .gitignore; default OFF so existing dirs stay byte-identical" + }, + { + "id": "h11", + "kind": "honesty", + "text": "mongo/neo4j is a no-op (no .gitignore), a read get()/list() creates nothing, and an existing .gitignore is never rewritten \u2014 each is a distinct passing test" + }, + { + "id": "c7", + "kind": "success_signal", + "text": "in an opted-in dir, git check-ignore reports __private.jsonl ignored and __public.jsonl tracked; re-materializing writes nothing 
(idempotent); option OFF is byte-identical to today; shipped in a tagged release eidetic can pin a floor to" + }, + { + "id": "h12", + "kind": "honesty", + "text": "the acceptance trio (check-ignore private-ignored & public-tracked; idempotent re-run writes nothing; OFF is byte-identical) are all expressible as passing tests, and the change ships under a bumped version + CHANGELOG entry" + }, + { + "id": "c8", + "kind": "requirement", + "text": "expose an opt-in write_gitignore flag (default False) on FilesBackend init, plumbed through the store surface eidetic consumes so the consumer passes only a bool and a base_dir it already owns" + }, + { + "id": "h1", + "kind": "honesty", + "text": "with the flag OFF (the default), a materialized store dir is byte-for-byte identical to today: no .gitignore, no extra files, no behavior change on any existing consumer or dir" + }, + { + "id": "c9", + "kind": "requirement", + "text": "when on, ensure base_dir/.gitignore holds the fail-closed whitelist exactly: a line '*', then '!.gitignore', then '!*__public.jsonl' \u2014 created only on a write/materialize, never on a read" + }, + { + "id": "h2", + "kind": "honesty", + "text": "in a real git repo, git check-ignore confirms __private.jsonl is ignored AND __public.jsonl is tracked under an opted-in base_dir" + }, + { + "id": "h3", + "kind": "honesty", + "text": "a read-only get()/list() (and a dry-run migrate) never creates the .gitignore; only an actual write/materialize does" + } + ], + "tasks": [ + { + "id": "t1", + "summary": "Core files-backend .gitignore support + unit/integration tests", + "origin": "llm", + "status": "confirmed", + "acceptance_criteria": [ + "FilesBackend(base_dir, write_gitignore=True) creates base_dir/.gitignore on the first upsert with exactly the bytes '*\\n!.gitignore\\n!*__public.jsonl\\n'", + "default write_gitignore=False writes no .gitignore; the materialized dir is byte-identical to current behavior (regression test)", + "get()/list() never 
create .gitignore even when write_gitignore=True (gitignore lives on write paths only, never in __init__)", + "an existing .gitignore is never overwritten even when its content differs from the whitelist (create-when-absent)", + "in a temp git repo: git check-ignore reports __private.jsonl and an arbitrary non-public sidecar name ignored, and __public.jsonl tracked", + "files.build(base_dir=..., write_gitignore=...) honors both kwargs (no longer dropped); store.put/get/list forward them through get_backend", + "mongo/neo4j backends remain unaffected (no .gitignore behavior); a re-run after the file exists writes nothing" + ], + "deps": [], + "covers": [ + "c1", + "c5", + "c6", + "c8", + "c9", + "h1", + "h2", + "h3", + "h6", + "h8", + "h10", + "h11", + "h12" + ] + }, + { + "id": "t2", + "summary": "Plumb write_gitignore + base_dir through store.migrate", + "origin": "llm", + "status": "confirmed", + "acceptance_criteria": [ + "store.migrate(transform, backend='files', base_dir=..., write_gitignore=True) materializes base_dir/.gitignore during the apply pass", + "dry_run=True writes nothing, including no .gitignore", + "migrate() signature gains write_gitignore: bool = False; with it off, migrate is byte-identical to today" + ], + "deps": [ + "t1" + ], + "covers": [ + "c3" + ] + }, + { + "id": "t3", + "summary": "Docs + version bump + CHANGELOG for the opt-in surface", + "origin": "llm", + "status": "confirmed", + "acceptance_criteria": [ + "docs/contract.md documents write_gitignore on the files put/migrate surface: the fail-closed whitelist, create-when-absent, and the mongo/neo4j no-op", + "README.md + AGENTS.colleague.md note the opt-in; CHANGELOG.md gains an Added entry; pyproject.toml version is bumped so version-check passes", + "the rationale (DR owns the layout so DR owns the ignore pattern; moves eidetic's S2083 sink) is captured in the contract doc" + ], + "deps": [ + "t1", + "t2" + ], + "covers": [ + "c4", + "c7", + "h9", + "h12" + ] + }, + { + "id": 
"t4", + "summary": "Cross-check eidetic-cli can reach write_gitignore via the importable surface", + "origin": "llm", + "status": "confirmed", + "acceptance_criteria": [ + "eidetic-cli's store consumption call sites are inspected and confirmed able to pass write_gitignore via store.migrate/store.put with a base_dir it owns (Option B); if not, a follow-up issue is filed naming the surface eidetic needs", + "the tagged-release floor eidetic will pin is identified (the version this ships in)" + ], + "deps": [], + "covers": [ + "c2", + "h7" + ] + } + ], + "risks": [ + { + "id": "r1", + "text": "git check-ignore acceptance tests require a git binary; the test must skip gracefully when git is absent rather than fail", + "kind": "unknown_nonblocking", + "task_id": "t1" + }, + { + "id": "r2", + "text": "t4 inspects sibling repo eidetic-cli, which may not be checked out locally; if absent, cross-check defers to a brief/issue on eidetic-cli rather than blocking the release", + "kind": "unknown_nonblocking", + "task_id": "t4" + } + ] +} diff --git a/docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md b/docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md new file mode 100644 index 0000000..836eeae --- /dev/null +++ b/docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md @@ -0,0 +1,49 @@ +# Build Plan — data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself + +slug: `data-refinery-s-files-backend-can-write-a-fail-clo` · status: `exported` · from frame: `data-refinery-s-files-backend-can-write-a-fail-clo` + +> data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself + +## Tasks + +### t1 — Core files-backend .gitignore support + unit/integration tests + 
+- covers: c1, c5, c6, c8, c9, h1, h2, h3, h6, h8, h10, h11, h12 +- acceptance: + - FilesBackend(base_dir, write_gitignore=True) creates base_dir/.gitignore on the first upsert with exactly the bytes '*\n!.gitignore\n!*__public.jsonl\n' + - default write_gitignore=False writes no .gitignore; the materialized dir is byte-identical to current behavior (regression test) + - get()/list() never create .gitignore even when write_gitignore=True (gitignore lives on write paths only, never in __init__) + - an existing .gitignore is never overwritten even when its content differs from the whitelist (create-when-absent) + - in a temp git repo: git check-ignore reports __private.jsonl and an arbitrary non-public sidecar name ignored, and __public.jsonl tracked + - files.build(base_dir=..., write_gitignore=...) honors both kwargs (no longer dropped); store.put/get/list forward them through get_backend + - mongo/neo4j backends remain unaffected (no .gitignore behavior); a re-run after the file exists writes nothing + +### t2 — Plumb write_gitignore + base_dir through store.migrate + +- depends on: t1 +- covers: c3 +- acceptance: + - store.migrate(transform, backend='files', base_dir=..., write_gitignore=True) materializes base_dir/.gitignore during the apply pass + - dry_run=True writes nothing, including no .gitignore + - migrate() signature gains write_gitignore: bool = False; with it off, migrate is byte-identical to today + +### t3 — Docs + version bump + CHANGELOG for the opt-in surface + +- depends on: t1, t2 +- covers: c4, c7, h9, h12 +- acceptance: + - docs/contract.md documents write_gitignore on the files put/migrate surface: the fail-closed whitelist, create-when-absent, and the mongo/neo4j no-op + - README.md + AGENTS.colleague.md note the opt-in; CHANGELOG.md gains an Added entry; pyproject.toml version is bumped so version-check passes + - the rationale (DR owns the layout so DR owns the ignore pattern; moves eidetic's S2083 sink) is captured in the contract doc + 
+### t4 — Cross-check eidetic-cli can reach write_gitignore via the importable surface + +- covers: c2, h7 +- acceptance: + - eidetic-cli's store consumption call sites are inspected and confirmed able to pass write_gitignore via store.migrate/store.put with a base_dir it owns (Option B); if not, a follow-up issue is filed naming the surface eidetic needs + - the tagged-release floor eidetic will pin is identified (the version this ships in) + +## Risks + +- [unknown_nonblocking] git check-ignore acceptance tests require a git binary; the test must skip gracefully when git is absent rather than fail (task t1) +- [unknown_nonblocking] t4 inspects sibling repo eidetic-cli, which may not be checked out locally; if absent, cross-check defers to a brief/issue on eidetic-cli rather than blocking the release (task t4) diff --git a/docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md b/docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md new file mode 100644 index 0000000..01cfec1 --- /dev/null +++ b/docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md @@ -0,0 +1,52 @@ +# data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself + +> data-refinery's files backend can write a fail-closed .gitignore on store-dir materialization, so a consumer keeps private shards out of git without ever constructing a write path itself + +## Audience + +- eidetic-cli (the first consumer, moving to repo-contained memory) and the assisting agent; more generally any files-backend consumer that wants repo-contained private shards + +## Before → After + +- Before: a consumer that wants private shards out of git must construct and write a .gitignore itself, reintroducing exactly the pythonsecurity:S2083 write-path sink that #8 removed by moving path-construction to DR +- After: a files store dir is 
materialized with a fail-closed .gitignore that ignores everything but public shards, so private shards (__private.jsonl) are git-ignored from their first write; the consumer opts in with a single flag and never builds a write path + +## Why it matters + +- DR owns the `<scope>__<visibility>.jsonl` on-disk layout, so DR must own the ignore pattern that tracks it; a whitelist (fail-closed) excludes any future private filename or sidecar DR introduces by default rather than silently leaking it + +## Requirements + +- expose an opt-in write_gitignore flag (default False) on FilesBackend init, plumbed through the store surface eidetic consumes so the consumer passes only a bool and a base_dir it already owns + - honesty: with the flag OFF (the default), a materialized store dir is byte-for-byte identical to today: no .gitignore, no extra files, no behavior change on any existing consumer or dir +- when on, ensure base_dir/.gitignore holds the fail-closed whitelist exactly: a line '*', then '!.gitignore', then '!*__public.jsonl' — created only on a write/materialize, never on a read + - honesty: in a real git repo, git check-ignore confirms __private.jsonl is ignored AND __public.jsonl is tracked under an opted-in base_dir + - honesty: a read-only get()/list() (and a dry-run migrate) never creates the .gitignore; only an actual write/materialize does + +## Honesty conditions + +- a files store dir opted in to write_gitignore ends up with private shards untracked by git and public shards tracked, and the consumer supplied only a bool + a base_dir it owns (no write path) +- eidetic-cli is a real, named first consumer whose repo-contained-memory cutover (its 2026-06-24 spec) is blocked on this issue, and the surface is generic enough that any other files-backend consumer could opt in identically +- the consumer reaches the materialized .gitignore by passing only write_gitignore=True + a base_dir it already owns; a test drives store.put/migrate and asserts the file exists without the caller
building any path +- without this endpoint the consumer's own .gitignore write is a flagged pythonsecurity:S2083 path sink (eidetic's prior BLOCKER that #8 and this issue move to DR) +- because the whitelist allows only *__public.jsonl, an arbitrary non-public sidecar name DR might add later is git-ignored by default — verifiable with git check-ignore on a made-up sidecar filename +- mongo/neo4j is a no-op (no .gitignore), a read get()/list() creates nothing, and an existing .gitignore is never rewritten — each is a distinct passing test +- the acceptance trio (check-ignore private-ignored & public-tracked; idempotent re-run writes nothing; OFF is byte-identical) are all expressible as passing tests, and the change ships under a bumped version + CHANGELOG entry +- re-materializing when a .gitignore already exists writes nothing and never overwrites it, even if its content differs from the canonical whitelist +- eidetic can reach write_gitignore through the importable store surface it already uses (store.migrate / store.put), so it never constructs a filesystem write path — confirming this requires checking eidetic's actual consumption call + +## Success signals + +- in an opted-in dir, git check-ignore reports __private.jsonl ignored and __public.jsonl tracked; re-materializing writes nothing (idempotent); option OFF is byte-identical to today; shipped in a tagged release eidetic can pin a floor to + +## Scope / boundaries + +- files backend only (mongo/neo4j have no on-disk dir -> no-op); never write on a read (list()/get() must not create files); never clobber an existing .gitignore; default OFF so existing dirs stay byte-identical + +## Assumptions + +- eidetic consumes write_gitignore via the importable store surface (store.migrate and/or store.put with base_dir + write_gitignore), which requires fixing files.build to stop dropping kwargs; no new CLI flag is needed for v1 + +## Decisions + +- create-when-absent only: if any .gitignore already exists, do nothing 
(no rewrite, no clobber) — it may carry user edits; idempotency is existence-based, not content-match From 3dc68ea40ded79d5f75bae687f3e3946118d75af Mon Sep 17 00:00:00 2001 From: Ori Nachum Date: Wed, 24 Jun 2026 17:21:47 +0300 Subject: [PATCH 2/8] colleague: Implement issue #12 in data-refinery-cli: the files backend o... MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement issue #12 in data-refinery-cli: the files backend optionally writes a fail-closed .gitignore on store-dir materialization. Work TEST-FIRST. Read first: docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md and CLAUDE.md. The target module is data_refinery/store/backends/files.py. Implement EXACTLY this (the agreed "Option B"): 1) data_refinery/store/backends/files.py - Add a keyword param `write_gitignore: bool = False` to FilesBackend.__init__ (after base_dir). Store it as self._write_gitignore. Do NOT change the existing eager `self._base.mkdir(parents=True, exist_ok=True)`. - Add a private method `_ensure_gitignore(self) -> None` that, ONLY when self._write_gitignore is True, creates `<base_dir>/.gitignore` ONLY IF it does not already exist (Path.exists() check), writing exactly these bytes (note trailing newline): *\n!.gitignore\n!*__public.jsonl\n Never overwrite an existing .gitignore (it may carry user edits). If writing it raises OSError, surface a structured CliError(code=EXIT_ENV_ERROR, ...) consistent with _atomic_write — never a traceback. - Call _ensure_gitignore() ONLY on write/materialize paths: at the very start of upsert(), and inside migrate() within the `if not dry_run:` block BEFORE the pass-2 apply loop (so a dry-run NEVER creates it, and an apply materializes it even if the plan is empty). Do NOT call it in __init__, get(), list(), or all(). Reads must never create the file.
- Fix the module-level `build()` factory (currently `def build(**_kwargs): return FilesBackend()`, which DROPS kwargs) to honor base_dir and write_gitignore: def build(*, base_dir=None, write_gitignore=False, **_kwargs): return FilesBackend(base_dir, write_gitignore=write_gitignore) Keep accepting/ignoring other kwargs (e.g. timeout_ms) via **_kwargs. 2) Tests — NEW file tests/test_store_gitignore.py (write the tests FIRST, then implement). Cover: - .gitignore content is exactly "*\n!.gitignore\n!*__public.jsonl\n" after an upsert with write_gitignore=True. - Default (write_gitignore=False): after upsert NO .gitignore exists; the dir holds only the scope .jsonl (byte-identical to today). - A read get()/list() with write_gitignore=True does NOT create .gitignore. - An existing .gitignore with DIFFERENT content is never overwritten after an upsert. - In a real temp git repo (git init + set user.email/user.name), with write_gitignore=True after putting one private-scope and one public-scope envelope: `git check-ignore -q __private.jsonl` is ignored (exit 0), an arbitrary non-public sidecar name like `foo__index.bin` is ignored, and `git check-ignore -q __public.jsonl` is NOT ignored (exit 1). Skip this test gracefully when `git` is absent (shutil.which("git") is None -> pytest.skip). - build(base_dir=tmp, write_gitignore=True) returns a FilesBackend honoring both; and data_refinery.store.put(env, backend="files", base_dir=tmp, write_gitignore=True) forwards the kwargs (get_backend->build now honors them) and materializes the .gitignore. Constraints: - stdlib only; do NOT add runtime dependencies (the `dependencies = []` invariant). - black + isort + flake8 clean at line-length 100; bandit clean. - No traceback ever; raise CliError on faults; match the existing file's style/helpers (_atomic_write, _serialize, _VISIBILITIES, etc.). 
- Before finishing, run: uv run pytest tests/test_store_gitignore.py -q AND uv run black --check data_refinery tests && uv run isort --check-only data_refinery tests && uv run flake8 data_refinery tests and make them all pass. Deliver the change committed on your drive branch. Do not edit pyproject.toml, CHANGELOG.md, README.md, docs/, or data_refinery/store/migrate.py — those are other tasks. Implement the task above in this repository. Rules: - Make the SMALLEST change that correctly satisfies the task. - Follow the repository's existing patterns, style, and conventions — read the neighbouring files first so your change reads like the surrounding code. - Keep edits lint-clean: respect the project's maximum line length and end every text file with exactly one trailing newline. - You may read, create, modify files, and run commands as needed. - Don't widen the scope: do exactly what was asked, nothing more. When you are done, call finish with a short summary of exactly what you changed and why. 
--- data_refinery/store/backends/files.py | 40 ++++++- tests/test_store_gitignore.py | 144 ++++++++++++++++++++++++++ uv.lock | 2 +- 3 files changed, 181 insertions(+), 5 deletions(-) create mode 100644 tests/test_store_gitignore.py diff --git a/data_refinery/store/backends/files.py b/data_refinery/store/backends/files.py index 63b4b27..0368afa 100644 --- a/data_refinery/store/backends/files.py +++ b/data_refinery/store/backends/files.py @@ -32,16 +32,45 @@ class FilesBackend: """Persist envelopes as JSONL files, one file per scope.""" - def __init__(self, base_dir: str | None = None) -> None: + def __init__(self, base_dir: str | None = None, *, write_gitignore: bool = False) -> None: if base_dir is None: base_dir = os.environ.get(_ENV_DIR) or str(Path.home() / ".data-refinery" / "store") self._base = Path(base_dir) self._base.mkdir(parents=True, exist_ok=True) + self._write_gitignore = write_gitignore # -- Backend protocol ----------------------------------------------- + def _ensure_gitignore(self) -> None: + """Create ``.gitignore`` in *base_dir* when ``write_gitignore`` is set. + + Only creates the file when it does not already exist (never overwrites + user edits). If writing raises ``OSError``, surfaces a structured + ``CliError`` — never a traceback. 
+ """ + if not self._write_gitignore: + return + gi = self._base / ".gitignore" + if gi.exists(): + return + try: + path = self._base / ".gitignore.tmp" + path.write_text("*\n!.gitignore\n!*__public.jsonl\n", encoding="utf-8") + os.replace(path, gi) + except OSError as exc: + try: + path.unlink() + except OSError: # pragma: no cover - best effort + pass + raise CliError( + code=EXIT_ENV_ERROR, + message=f"could not write .gitignore: {exc}", + remediation=f"check permissions on {self._base}", + ) from exc + def upsert(self, envelope: Envelope) -> None: """Insert or replace *envelope* idempotently (by id; dedup by hash on insert).""" + self._ensure_gitignore() path = self._scope_file(envelope.scope) records = self._load(path) @@ -141,6 +170,7 @@ def migrate( # per file (temp sibling + os.replace), so a crash here still leaves each # file either fully old or fully new and the run is safe to resume. if not dry_run: + self._ensure_gitignore() for path, new_text in plan: self._atomic_write(path, new_text) return { @@ -345,6 +375,8 @@ def _to_envelope(obj: dict[str, Any], transform: Transform | None) -> Envelope | return transform(obj) -def build(**_kwargs: object) -> Backend: - """Factory: a default FilesBackend (ignores kwargs like ``timeout_ms``).""" - return FilesBackend() +def build( + *, base_dir: str | None = None, write_gitignore: bool = False, **_kwargs: object +) -> Backend: + """Factory: a FilesBackend honouring ``base_dir`` and ``write_gitignore``.""" + return FilesBackend(base_dir, write_gitignore=write_gitignore) diff --git a/tests/test_store_gitignore.py b/tests/test_store_gitignore.py new file mode 100644 index 0000000..4d2fc71 --- /dev/null +++ b/tests/test_store_gitignore.py @@ -0,0 +1,144 @@ +"""Files-backend .gitignore materialization (issue #12). 
+ +When ``write_gitignore=True`` is passed to ``FilesBackend`` (or forwarded +through ``store.put`` / ``store.migrate``), the backend creates a fail-closed +``.gitignore`` on the first write, ignoring everything except public shards. +Reads never create the file; existing files are never overwritten. +""" + +from __future__ import annotations + +import shutil +import subprocess + +import pytest + +import data_refinery.store as store +from data_refinery.store.backends.files import FilesBackend, build +from data_refinery.store.envelope import Envelope, Scope + +_GITIGNORE_CONTENT = "*\n!.gitignore\n!*__public.jsonl\n" + + +# ------------------------------------------------------------------ +# Content / existence +# ------------------------------------------------------------------ + + +def test_gitignore_content_after_upsert(tmp_path) -> None: + """A write_gitignore=True upsert creates .gitignore with the canonical content.""" + backend = FilesBackend(base_dir=str(tmp_path), write_gitignore=True) + backend.upsert(Envelope(id="a", content="hello")) + gi = tmp_path / ".gitignore" + assert gi.exists() + assert gi.read_text() == _GITIGNORE_CONTENT + + +def test_default_no_gitignore(tmp_path) -> None: + """Default (write_gitignore=False) never creates .gitignore.""" + backend = FilesBackend(base_dir=str(tmp_path)) + backend.upsert(Envelope(id="a", content="hello")) + assert not (tmp_path / ".gitignore").exists() + + +def test_read_does_not_create_gitignore(tmp_path) -> None: + """get()/list() with write_gitignore=True must NOT create .gitignore.""" + backend = FilesBackend(base_dir=str(tmp_path), write_gitignore=True) + # No envelopes yet — reads on an empty store + assert backend.get("nope", Scope("default", "public")) is None + assert backend.list(Scope("default", "public")) == [] + assert not (tmp_path / ".gitignore").exists() + + +def test_existing_gitignore_never_overwritten(tmp_path) -> None: + """A pre-existing .gitignore with different content is never 
clobbered.""" + gi = tmp_path / ".gitignore" + gi.write_text("my-custom-rules\n") + backend = FilesBackend(base_dir=str(tmp_path), write_gitignore=True) + backend.upsert(Envelope(id="a", content="hello")) + assert gi.read_text() == "my-custom-rules\n" + + +# ------------------------------------------------------------------ +# Real git integration +# ------------------------------------------------------------------ + + +@pytest.mark.skipif( + shutil.which("git") is None, + reason="git not installed", +) +def test_git_check_ignore_private_ignored_public_tracked(tmp_path) -> None: + """In a real git repo, private shards are ignored and public shards are tracked.""" + # Initialise a git repo inside tmp_path + subprocess.run(["git", "init"], cwd=tmp_path, check=True, capture_output=True) + subprocess.run( + ["git", "config", "user.email", "test@test.com"], + cwd=tmp_path, + check=True, + capture_output=True, + ) + subprocess.run( + ["git", "config", "user.name", "Test"], + cwd=tmp_path, + check=True, + capture_output=True, + ) + + backend = FilesBackend(base_dir=str(tmp_path), write_gitignore=True) + backend.upsert(Envelope(id="priv", content="secret", scope=Scope("myapp", "private"))) + backend.upsert(Envelope(id="pub", content="hello", scope=Scope("myapp", "public"))) + + private_file = tmp_path / "myapp__private.jsonl" + public_file = tmp_path / "myapp__public.jsonl" + sidecar = tmp_path / "foo__index.bin" + sidecar.write_text("sidecar") + + # Private shard is ignored + result = subprocess.run( + ["git", "check-ignore", "-q", str(private_file)], + cwd=tmp_path, + capture_output=True, + ) + assert result.returncode == 0, "private shard should be ignored" + + # Arbitrary non-public sidecar is ignored + result = subprocess.run( + ["git", "check-ignore", "-q", str(sidecar)], + cwd=tmp_path, + capture_output=True, + ) + assert result.returncode == 0, "non-public sidecar should be ignored" + + # Public shard is NOT ignored + result = subprocess.run( + ["git", 
"check-ignore", "-q", str(public_file)], + cwd=tmp_path, + capture_output=True, + ) + assert result.returncode != 0, "public shard should NOT be ignored" + + +# ------------------------------------------------------------------ +# Factory / store.put forwarding +# ------------------------------------------------------------------ + + +def test_build_forwards_write_gitignore(tmp_path) -> None: + """build(base_dir=..., write_gitignore=True) returns a backend that honours the flag.""" + backend = build(base_dir=str(tmp_path), write_gitignore=True) + assert isinstance(backend, FilesBackend) + backend.upsert(Envelope(id="a", content="hello")) + assert (tmp_path / ".gitignore").exists() + + +def test_store_put_forwards_write_gitignore(tmp_path) -> None: + """store.put(..., backend='files', write_gitignore=True) materialises .gitignore.""" + store.put( + Envelope(id="a", content="hello"), + backend="files", + base_dir=str(tmp_path), + write_gitignore=True, + ) + assert (tmp_path / ".gitignore").exists() + assert (tmp_path / ".gitignore").read_text() == _GITIGNORE_CONTENT diff --git a/uv.lock b/uv.lock index 14157a2..f565d16 100644 --- a/uv.lock +++ b/uv.lock @@ -156,7 +156,7 @@ wheels = [ [[package]] name = "data-refinery-cli" -version = "0.6.0" +version = "0.8.0" source = { editable = "." } [package.optional-dependencies] From afe12c72f3df697e45fa2332d0a1e4d122f0f472 Mon Sep 17 00:00:00 2001 From: Ori Nachum Date: Wed, 24 Jun 2026 17:27:19 +0300 Subject: [PATCH 3/8] colleague: Issue #12 in data-refinery-cli, task t2: plumb write_gitignor... MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #12 in data-refinery-cli, task t2: plumb write_gitignore through the importable store.migrate endpoint. Work TEST-FIRST. The files backend already supports write_gitignore (FilesBackend.__init__ accepts it and FilesBackend.migrate() calls _ensure_gitignore on apply) — t2 only wires the top-level migrate() function to forward it. 
Target file: data_refinery/store/migrate.py 1) data_refinery/store/migrate.py - The current top-level function is: def migrate(transform=None, *, backend="files", base_dir=None, dry_run=False) -> dict[str, Any]: if backend == "files": return FilesBackend(base_dir).migrate(transform, dry_run=dry_run) raise CliError(...) - Add a keyword param `write_gitignore: bool = False` (place it after base_dir, before dry_run). Forward it into the FilesBackend constructor: return FilesBackend(base_dir, write_gitignore=write_gitignore).migrate(transform, dry_run=dry_run) - Update the docstring: with write_gitignore=True the files backend materializes the fail-closed .gitignore (* / !.gitignore / !*__public.jsonl) during the apply pass; a dry_run never writes it; default False is byte-identical to today; files backend only. - Do NOT add a CLI flag — the CLI `store migrate` verb stays unchanged (Option B is import-surface only). 2) Tests — ADD to the EXISTING file tests/test_store_migrate.py (do NOT create a new test file, and do NOT touch tests/test_store_gitignore.py). Add tests covering: - store.migrate(base_dir=tmp, write_gitignore=True) creates tmp/.gitignore after a real (non-dry) migrate. Setup: first store.put(Envelope(id="a", content="x"), backend="files", base_dir=tmp) WITHOUT write_gitignore so a scope file exists and no .gitignore yet; then store.migrate(base_dir=tmp, write_gitignore=True); assert (tmp/".gitignore").exists() and its content == "*\n!.gitignore\n!*__public.jsonl\n". - store.migrate(base_dir=tmp, write_gitignore=True, dry_run=True) does NOT create tmp/.gitignore. - default store.migrate(base_dir=tmp) (write_gitignore omitted) creates no .gitignore. - import data_refinery.store as store; use store.migrate / store.put / store.Envelope (or import Envelope from data_refinery.store.envelope, matching the existing test file's import style — read the file first and match it). Constraints: - stdlib only; no new runtime dependencies. 
- black + isort + flake8 clean at line-length 100; bandit clean; no traceback (CliError only). - Match the existing migrate.py + test_store_migrate.py style. - Before finishing run: uv run pytest tests/test_store_migrate.py -q AND uv run black --check data_refinery tests && uv run isort --check-only data_refinery tests && uv run flake8 data_refinery tests — all must pass. Deliver committed on your drive branch. Do NOT edit files.py, pyproject.toml, CHANGELOG.md, README.md, docs/, or uv.lock. Implement the task above in this repository. Rules: - Make the SMALLEST change that correctly satisfies the task. - Follow the repository's existing patterns, style, and conventions — read the neighbouring files first so your change reads like the surrounding code. - Keep edits lint-clean: respect the project's maximum line length and end every text file with exactly one trailing newline. - You may read, create, modify files, and run commands as needed. - Don't widen the scope: do exactly what was asked, nothing more. When you are done, call finish with a short summary of exactly what you changed and why. --- data_refinery/store/migrate.py | 10 +++++++++- tests/test_store_migrate.py | 27 +++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/data_refinery/store/migrate.py b/data_refinery/store/migrate.py index 36929cf..3074d6d 100644 --- a/data_refinery/store/migrate.py +++ b/data_refinery/store/migrate.py @@ -28,6 +28,7 @@ def migrate( *, backend: str = DEFAULT_BACKEND, base_dir: str | None = None, + write_gitignore: bool = False, dry_run: bool = False, ) -> dict[str, Any]: """Upgrade an on-disk store to the current Envelope format. @@ -39,6 +40,11 @@ def migrate( optionally the store root it already owns via *base_dir*) — never a per-file write path. 
+ With ``write_gitignore=True`` the files backend materialises the fail-closed + ``.gitignore`` (``* / !.gitignore / !*__public.jsonl``) during the apply pass; + a dry_run never writes it; default ``False`` is byte-identical to today. + Files backend only. + Idempotent: a second run rewrites nothing. The consumer's transform need **not** itself be idempotent — after the first run every line is a canonical Envelope, and the files backend keeps an already-canonical line **verbatim** @@ -55,7 +61,9 @@ def migrate( structured :class:`CliError`. """ if backend == "files": - return FilesBackend(base_dir).migrate(transform, dry_run=dry_run) + return FilesBackend(base_dir, write_gitignore=write_gitignore).migrate( + transform, dry_run=dry_run + ) raise CliError( code=EXIT_USER_ERROR, message=f"store migration is not yet supported for backend {backend!r}", diff --git a/tests/test_store_migrate.py b/tests/test_store_migrate.py index 0b342f0..4a0a599 100644 --- a/tests/test_store_migrate.py +++ b/tests/test_store_migrate.py @@ -431,3 +431,30 @@ def test_cli_store_migrate_unsupported_backend_exits_1(files_env: str, capsys) - # Text mode: the same error renders the load-bearing `hint:` prefix. assert main(["store", "migrate", "--backend", "mongo"]) == 1 assert "hint:" in capsys.readouterr().err + + +# --- write_gitignore through the importable endpoint ------------------------- + + +def test_migrate_write_gitignore_creates_gitignore(tmp_path: Path) -> None: + # Seed a scope file WITHOUT write_gitignore so no .gitignore exists yet. 
+ store.put(Envelope(id="a", content="x"), backend="files", base_dir=str(tmp_path)) + assert not (tmp_path / ".gitignore").exists() + store.migrate(base_dir=str(tmp_path), write_gitignore=True) + assert (tmp_path / ".gitignore").exists() + assert (tmp_path / ".gitignore").read_text( + encoding="utf-8" + ) == "*\n!.gitignore\n!*__public.jsonl\n" + + +def test_migrate_write_gitignore_dry_run_does_not_create(tmp_path: Path) -> None: + store.put(Envelope(id="a", content="x"), backend="files", base_dir=str(tmp_path)) + assert not (tmp_path / ".gitignore").exists() + store.migrate(base_dir=str(tmp_path), write_gitignore=True, dry_run=True) + assert not (tmp_path / ".gitignore").exists() + + +def test_migrate_default_no_write_gitignore(tmp_path: Path) -> None: + store.put(Envelope(id="a", content="x"), backend="files", base_dir=str(tmp_path)) + store.migrate(base_dir=str(tmp_path)) + assert not (tmp_path / ".gitignore").exists() From 7d0c53fc37cafb9405823d585c11163a5aab48d3 Mon Sep 17 00:00:00 2001 From: Ori Nachum Date: Wed, 24 Jun 2026 17:30:07 +0300 Subject: [PATCH 4/8] chore: bump version 0.8.0 -> 0.9.0 (write_gitignore opt-in, issue #12) Minor bump per the contract versioning policy (new optional param). CHANGELOG documents the fail-closed .gitignore opt-in and the build() kwarg-honoring fix. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LAEeF8y7RrKft8de7rZfDM --- CHANGELOG.md | 10 ++++++++++ pyproject.toml | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ceace1..d1f2fe0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. Format follows [Keep a Changelog](https://keepachangelog.com/). This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.9.0] - 2026-06-24 + +### Added + +- Files store backend: optional `write_gitignore` flag (default off) that writes a fail-closed `.gitignore` (`*` / `!.gitignore` / `!*__public.jsonl`) into the store `base_dir` on materialization, so a consumer keeps private shards out of git without ever constructing a filesystem write path (issue #12). Reachable via `FilesBackend(base_dir, write_gitignore=True)`, `store.put/get/list(..., backend="files", base_dir=..., write_gitignore=True)`, and `store.migrate(..., write_gitignore=True)`. Written only on a write/materialize (never on a read or a dry-run migrate), create-when-absent (never clobbers an existing `.gitignore`), files backend only (mongo/neo4j are a no-op). + +### Changed + +- `data_refinery.store.backends.files.build()` now honors `base_dir` and `write_gitignore` kwargs (it previously dropped all kwargs), so the importable `store.put`/`get`/`list` surface can target a caller-owned `base_dir`. + ## [0.8.0] - 2026-06-24 ### Added diff --git a/pyproject.toml b/pyproject.toml index 6ad5d0d..2230c67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data-refinery-cli" -version = "0.8.0" +version = "0.9.0" description = "Agent and CLI for data quality in storage and retrieval — validating, deduplicating, and checking the integrity and freshness of data as it is stored and fetched. Split out of eidetic-cli so eidetic keeps agent-memory; sibling to daria, the Data Refinery Intelligent Agent." readme = "README.md" license = "Apache-2.0" From b599c87771a995fa66bb407f1798b92be24104c1 Mon Sep 17 00:00:00 2001 From: Ori Nachum Date: Wed, 24 Jun 2026 17:31:52 +0300 Subject: [PATCH 5/8] colleague: Issue #12 in data-refinery-cli, task t3 (DOCS ONLY): document... MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #12 in data-refinery-cli, task t3 (DOCS ONLY): document the new opt-in `write_gitignore` files-store surface that was just implemented. 
Do NOT touch any .py file, pyproject.toml, CHANGELOG.md, or uv.lock — the version bump + CHANGELOG are handled separately. Read the just-merged code first so the docs are accurate: - data_refinery/store/backends/files.py (FilesBackend.__init__ now takes write_gitignore; _ensure_gitignore; build() honors base_dir+write_gitignore) - data_refinery/store/migrate.py (migrate() now takes write_gitignore) - docs/contract.md, README.md, AGENTS.colleague.md (the files you will edit) What the feature is (document it exactly): when a consumer opts in with `write_gitignore=True`, the files backend ensures a fail-closed `.gitignore` exists in the store `base_dir` with EXACTLY this content: * !.gitignore !*__public.jsonl It ignores everything and only ever allows public shards (and the .gitignore itself) to be tracked, so any future private filename or sidecar is excluded by default. Behavior: opt-in (default False; off is byte-identical to today); written only on a write/materialize (upsert + a non-dry store.migrate apply), NEVER on a read (get/list) or a dry-run; create-when-absent only (an existing .gitignore is never overwritten); files backend only (mongo/neo4j are a no-op). It is reachable WITHOUT the consumer constructing any filesystem write path, via: FilesBackend(base_dir, write_gitignore=True); data_refinery.store.put/get/list(..., backend="files", base_dir=..., write_gitignore=True) (get_backend forwards kwargs to the files build()); and data_refinery.store.migrate(transform, *, backend="files", base_dir=..., write_gitignore=..., dry_run=...). Rationale: data-refinery OWNS the `<name>__<visibility>.jsonl` on-disk layout, so it owns the ignore pattern that tracks it; this keeps the .gitignore write-path sink (the consumer's prior pythonsecurity:S2083) on the storage owner. Continues issues #8 / #1. 
Edits to make: 1) docs/contract.md - In the Wave 3 section (after "the store-migration endpoint" subsection, before "## Versioning policy"), add a new subsection titled exactly: ### Fail-closed `.gitignore` opt-in (stable) documenting all of the above: the opt-in param, the exact whitelist content (in a ```gitignore code block), the materialize-on-write-not-read rule, create-when-absent/never-clobber, files-only no-op, default-off-byte-identical, the three reachable surfaces (FilesBackend init / store.put|get|list / store.migrate), and the boundary rationale. Keep the prose terse and in the same voice as the rest of the doc. - Update the migrate signature line currently reading: `migrate(transform=None, *, backend="files", base_dir=None, dry_run=False)` to include the new parameter: `migrate(transform=None, *, backend="files", base_dir=None, write_gitignore=False, dry_run=False)` 2) README.md — find the store / store-surface section and add a short note (1-3 sentences) that the files backend can optionally write a fail-closed `.gitignore` via `write_gitignore=True` (default off), so a consumer keeps private shards out of git without constructing a write path. Match the README's existing tone/format. If you cannot find an obviously-right store section, add it near where the store put/get/list or `[store]` extra is described. 3) AGENTS.colleague.md — find where the store surface is described and add a one-line mention of the `write_gitignore` opt-in (files-only, default off, fail-closed). If there is no store section, add a concise line under the most relevant heading. Constraints: - Markdown must pass: markdownlint-cli2 "docs/contract.md" "README.md" "AGENTS.colleague.md" (fix any violations you introduce; match the surrounding heading/list/code-fence style). - Do NOT edit code, tests, pyproject.toml, CHANGELOG.md, or uv.lock. - Be accurate to the code you read — do not invent flags or behavior. Deliver committed on your drive branch. 
Implement the task above in this repository. Rules: - Make the SMALLEST change that correctly satisfies the task. - Follow the repository's existing patterns, style, and conventions — read the neighbouring files first so your change reads like the surrounding code. - Keep edits lint-clean: respect the project's maximum line length and end every text file with exactly one trailing newline. - You may read, create, modify files, and run commands as needed. - Don't widen the scope: do exactly what was asked, nothing more. When you are done, call finish with a short summary of exactly what you changed and why. --- AGENTS.colleague.md | 4 +++- README.md | 4 ++++ docs/contract.md | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/AGENTS.colleague.md b/AGENTS.colleague.md index 3b0dbf4..c8cf5f5 100644 --- a/AGENTS.colleague.md +++ b/AGENTS.colleague.md @@ -16,7 +16,9 @@ behavior, update both. data-refinery-cli owns the **storage + data-quality infrastructure layer** split out of eidetic-cli (issue #1): the mongo + neo4j substrate, the docker stack (published to GHCR), a storage-neutral **store** (`store put/get/list` over a -files/mongo/neo4j `Backend`, also importable as `data_refinery.store`), and a +files/mongo/neo4j `Backend`, also importable as `data_refinery.store`; the files +backend accepts an opt-in `write_gitignore=True` to write a fail-closed +`.gitignore`, files-only, default off), and a **consumer-agnostic** data-quality surface (`validate`, `dedup`, `integrity`, `freshness`). 
It treats stored data as **opaque envelopes** (`{id, hash, content, scope, metadata}`) and never interprets them as "memories" diff --git a/README.md b/README.md index 91f0fab..abc23ea 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,10 @@ store.list() # -> list[Envelope] # Upgrade a populated legacy store to the current Envelope format — the consumer # supplies only a transform, never a filesystem write path (data-refinery owns it): store.migrate(record_to_envelope, base_dir="/path/to/store") + +# Opt-in: write a fail-closed .gitignore so private shards stay out of git +# (files backend only; default off, byte-identical when omitted): +store.put(env, backend="files", base_dir="/path/to/store", write_gitignore=True) ``` ## CLI diff --git a/docs/contract.md b/docs/contract.md index 187b417..4f8f4ed 100644 --- a/docs/contract.md +++ b/docs/contract.md @@ -150,7 +150,7 @@ summary = migrate(record_to_envelope, base_dir="/path/to/store") # -> dict # record_to_envelope: Callable[[dict], Envelope | None] (None drops a record) ``` -`migrate(transform=None, *, backend="files", base_dir=None, dry_run=False)` +`migrate(transform=None, *, backend="files", base_dir=None, write_gitignore=False, dry_run=False)` returns `{backend, files, migrated, migrated_files, skipped, dry_run}`. With `transform=None` it re-canonicalises data-refinery's own Envelope-JSONL (the self-heal path the `store migrate` CLI verb uses). The consumer supplies a @@ -171,6 +171,42 @@ write path. - **Files granularity only** today — `mongo` (vectors) / `neo4j` (graph) migration are a later granularity and exit `1` with a `hint:`. 
+### Fail-closed `.gitignore` opt-in (stable) + +When a consumer opts in with `write_gitignore=True`, the files backend ensures a +fail-closed `.gitignore` exists in the store `base_dir` with exactly this +content: + +```gitignore +* +!.gitignore +!*__public.jsonl +``` + +It ignores everything and only ever allows public shards (and the `.gitignore` +itself) to be tracked, so any future private filename or sidecar is excluded by +default. + +**Behavior:** + +- **Opt-in** — default `False`; off is byte-identical to today. +- **Materialise on write, never on read** — written only during an upsert or a + non-dry `store migrate` apply; never on `get`/`list` or a dry-run. +- **Create-when-absent only** — an existing `.gitignore` is never overwritten. +- **Files backend only** — `mongo`/`neo4j` are a no-op. + +**Reachable surfaces:** + +- `FilesBackend(base_dir, write_gitignore=True)` +- `data_refinery.store.put/get/list(..., backend="files", base_dir=..., write_gitignore=True)` + (`get_backend` forwards kwargs to the files `build()`) +- `data_refinery.store.migrate(transform, *, backend="files", base_dir=..., write_gitignore=..., dry_run=...)` + +**Rationale:** data-refinery owns the `<name>__<visibility>.jsonl` on-disk +layout, so it owns the ignore pattern that tracks it; this keeps the `.gitignore` +write-path sink (the consumer's prior pythonsecurity:S2083) on the storage +owner. Continues issues #8 / #1. + ## Versioning policy | Change | Requires | From 158707fb9fcfd26b27ee80cde7122907c8ceb867 Mon Sep 17 00:00:00 2001 From: Ori Nachum Date: Wed, 24 Jun 2026 17:34:50 +0300 Subject: [PATCH 6/8] docs: backtick filename tokens in spec/plan (markdownlint MD033) The devague-exported spec/plan used bare <name>__private.jsonl tokens that markdownlint reads as inline HTML; wrap them in code spans so the CI markdown lint job stays green (docs/specs is not in the ignore list). 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LAEeF8y7RrKft8de7rZfDM --- ...-data-refinery-s-files-backend-can-write-a-fail-clo.md | 2 +- ...-data-refinery-s-files-backend-can-write-a-fail-clo.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md b/docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md index 836eeae..f5f2a90 100644 --- a/docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md +++ b/docs/plans/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md @@ -14,7 +14,7 @@ slug: `data-refinery-s-files-backend-can-write-a-fail-clo` · status: `exported` - default write_gitignore=False writes no .gitignore; the materialized dir is byte-identical to current behavior (regression test) - get()/list() never create .gitignore even when write_gitignore=True (gitignore lives on write paths only, never in __init__) - an existing .gitignore is never overwritten even when its content differs from the whitelist (create-when-absent) - - in a temp git repo: git check-ignore reports __private.jsonl and an arbitrary non-public sidecar name ignored, and __public.jsonl tracked + - in a temp git repo: git check-ignore reports `__private.jsonl` and an arbitrary non-public sidecar name ignored, and `__public.jsonl` tracked - files.build(base_dir=..., write_gitignore=...) 
honors both kwargs (no longer dropped); store.put/get/list forward them through get_backend - mongo/neo4j backends remain unaffected (no .gitignore behavior); a re-run after the file exists writes nothing diff --git a/docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md b/docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md index 01cfec1..39aa5b0 100644 --- a/docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md +++ b/docs/specs/2026-06-24-data-refinery-s-files-backend-can-write-a-fail-clo.md @@ -9,18 +9,18 @@ ## Before → After - Before: a consumer that wants private shards out of git must construct and write a .gitignore itself, reintroducing exactly the pythonsecurity:S2083 write-path sink that #8 removed by moving path-construction to DR -- After: a files store dir is materialized with a fail-closed .gitignore that ignores everything but public shards, so private shards (__private.jsonl) are git-ignored from their first write; the consumer opts in with a single flag and never builds a write path +- After: a files store dir is materialized with a fail-closed .gitignore that ignores everything but public shards, so private shards (`__private.jsonl`) are git-ignored from their first write; the consumer opts in with a single flag and never builds a write path ## Why it matters -- DR owns the __.jsonl on-disk layout, so DR must own the ignore pattern that tracks it; a whitelist (fail-closed) excludes any future private filename or sidecar DR introduces by default rather than silently leaking it +- DR owns the `__.jsonl` on-disk layout, so DR must own the ignore pattern that tracks it; a whitelist (fail-closed) excludes any future private filename or sidecar DR introduces by default rather than silently leaking it ## Requirements - expose an opt-in write_gitignore flag (default False) on FilesBackend init, plumbed through the store surface eidetic consumes so the consumer passes only a bool and a 
base_dir it already owns - honesty: with the flag OFF (the default), a materialized store dir is byte-for-byte identical to today: no .gitignore, no extra files, no behavior change on any existing consumer or dir - when on, ensure base_dir/.gitignore holds the fail-closed whitelist exactly: a line '*', then '!.gitignore', then '!*__public.jsonl' — created only on a write/materialize, never on a read - - honesty: in a real git repo, git check-ignore confirms __private.jsonl is ignored AND __public.jsonl is tracked under an opted-in base_dir + - honesty: in a real git repo, git check-ignore confirms `__private.jsonl` is ignored AND `__public.jsonl` is tracked under an opted-in base_dir - honesty: a read-only get()/list() (and a dry-run migrate) never creates the .gitignore; only an actual write/materialize does ## Honesty conditions @@ -37,7 +37,7 @@ ## Success signals -- in an opted-in dir, git check-ignore reports __private.jsonl ignored and __public.jsonl tracked; re-materializing writes nothing (idempotent); option OFF is byte-identical to today; shipped in a tagged release eidetic can pin a floor to +- in an opted-in dir, git check-ignore reports `__private.jsonl` ignored and `__public.jsonl` tracked; re-materializing writes nothing (idempotent); option OFF is byte-identical to today; shipped in a tagged release eidetic can pin a floor to ## Scope / boundaries From 5d9904304e742ea3467b3a8094b06cd002d807a5 Mon Sep 17 00:00:00 2001 From: Ori Nachum Date: Wed, 24 Jun 2026 17:44:45 +0300 Subject: [PATCH 7/8] harden: reap orphan .gitignore.tmp + DRY _ensure_gitignore via _atomic_write Addresses the colleague review's one concrete finding: a crash mid-.gitignore write could leave a .gitignore.tmp that _reap_orphan_tmp (globbing only *.jsonl.tmp) never cleaned. _ensure_gitignore now reuses the shared _atomic_write (consistent structured CliError, no duplicated temp+replace), and _reap_orphan_tmp also reaps the .gitignore temp. New test locks in the reaping. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LAEeF8y7RrKft8de7rZfDM --- data_refinery/store/backends/files.py | 41 +++++++++++++-------------- tests/test_store_gitignore.py | 17 +++++++++++ 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/data_refinery/store/backends/files.py b/data_refinery/store/backends/files.py index 0368afa..c97206e 100644 --- a/data_refinery/store/backends/files.py +++ b/data_refinery/store/backends/files.py @@ -20,6 +20,10 @@ _ENV_DIR = "DR_DATA_DIR" _JSONL_GLOB = "*.jsonl" # one scope file per (name, visibility) _TMP_SUFFIX = ".tmp" # atomic-write temp sibling: ".jsonl.tmp" +_GITIGNORE_NAME = ".gitignore" +# Fail-closed whitelist (issue #12): ignore everything but public shards, so any +# future private filename or sidecar is excluded by default rather than leaked. +_GITIGNORE_BODY = "*\n!.gitignore\n!*__public.jsonl\n" # Re-derived from the public `Visibility` type so it never drifts from it. _VISIBILITIES: tuple[str, ...] = get_args(Visibility) @@ -44,29 +48,18 @@ def __init__(self, base_dir: str | None = None, *, write_gitignore: bool = False def _ensure_gitignore(self) -> None: """Create ``.gitignore`` in *base_dir* when ``write_gitignore`` is set. - Only creates the file when it does not already exist (never overwrites - user edits). If writing raises ``OSError``, surfaces a structured - ``CliError`` — never a traceback. + Create-when-absent only (never overwrites user edits). Reuses the shared + :meth:`_atomic_write` (temp sibling + ``os.replace``), so a write fault + surfaces as a structured ``CliError`` — never a traceback — and a crash + leaves either no file or the complete whitelist (the orphan temp is + reaped by :meth:`_reap_orphan_tmp`). 
""" if not self._write_gitignore: return - gi = self._base / ".gitignore" + gi = self._base / _GITIGNORE_NAME if gi.exists(): return - try: - path = self._base / ".gitignore.tmp" - path.write_text("*\n!.gitignore\n!*__public.jsonl\n", encoding="utf-8") - os.replace(path, gi) - except OSError as exc: - try: - path.unlink() - except OSError: # pragma: no cover - best effort - pass - raise CliError( - code=EXIT_ENV_ERROR, - message=f"could not write .gitignore: {exc}", - remediation=f"check permissions on {self._base}", - ) from exc + self._atomic_write(gi, _GITIGNORE_BODY) def upsert(self, envelope: Envelope) -> None: """Insert or replace *envelope* idempotently (by id; dedup by hash on insert).""" @@ -245,13 +238,19 @@ def _assert_contained(path: Path, root: Path) -> None: @staticmethod def _reap_orphan_tmp(root: Path) -> None: - """Remove ``*.jsonl.tmp`` left by a prior interrupted rewrite. + """Remove ``*.jsonl.tmp`` / ``.gitignore.tmp`` left by an interrupted write. ``os.replace`` consumes the temp on success, so a surviving temp is the residue of a crash *before* the swap — the real file is intact. Reaping - keeps the store dir tidy and the ``*.jsonl`` glob unambiguous. + keeps the store dir tidy and the ``*.jsonl`` glob unambiguous. The + ``.gitignore`` temp shares the same atomic-write path, so it is reaped + here too (it falls outside the ``*.jsonl.tmp`` glob). 
""" - for tmp in root.glob(_JSONL_GLOB + _TMP_SUFFIX): + temps = list(root.glob(_JSONL_GLOB + _TMP_SUFFIX)) + gi_tmp = root / (_GITIGNORE_NAME + _TMP_SUFFIX) + if gi_tmp.exists(): + temps.append(gi_tmp) + for tmp in temps: try: tmp.unlink() except OSError: # pragma: no cover - best effort diff --git a/tests/test_store_gitignore.py b/tests/test_store_gitignore.py index 4d2fc71..b1e4190 100644 --- a/tests/test_store_gitignore.py +++ b/tests/test_store_gitignore.py @@ -142,3 +142,20 @@ def test_store_put_forwards_write_gitignore(tmp_path) -> None: ) assert (tmp_path / ".gitignore").exists() assert (tmp_path / ".gitignore").read_text() == _GITIGNORE_CONTENT + + +# ------------------------------------------------------------------ +# Crash hygiene +# ------------------------------------------------------------------ + + +def test_orphan_gitignore_tmp_is_reaped_by_migrate(tmp_path) -> None: + """A stray .gitignore.tmp (a crashed write's debris) is reaped on migrate.""" + backend = FilesBackend(base_dir=str(tmp_path), write_gitignore=True) + backend.upsert(Envelope(id="a", content="hello")) # scope file + .gitignore + stray = tmp_path / ".gitignore.tmp" + stray.write_text("debris") + backend.migrate() # reaps orphan temps before planning + assert not stray.exists() + # The real .gitignore is untouched (create-when-absent on a re-materialise). + assert (tmp_path / ".gitignore").read_text() == _GITIGNORE_CONTENT From dce88526708c17d7ca7a504463bb9a1ac7880753 Mon Sep 17 00:00:00 2001 From: Ori Nachum Date: Wed, 24 Jun 2026 17:48:37 +0300 Subject: [PATCH 8/8] chore: sync uv.lock data-refinery-cli version to 0.9.0 The version-bump skill updates pyproject.toml + CHANGELOG but not uv.lock; sync the lockfile's editable-package version so it matches the 0.9.0 bump. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LAEeF8y7RrKft8de7rZfDM --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index f565d16..12b9cb8 100644 --- a/uv.lock +++ b/uv.lock @@ -156,7 +156,7 @@ wheels = [ [[package]] name = "data-refinery-cli" -version = "0.8.0" +version = "0.9.0" source = { editable = "." } [package.optional-dependencies]