bounded-systems · bdelanghe · Jun 29, 2026 · Jun 29, 2026
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@ hardcodes `robertdelanghe.dev`, `bounded.tools`, an account, or an email.
 integrity/    verify-site · verify (sigstore) · gen-sitemanifest · gen-provenance · structure-audit · http-probe
 gates/        sbom (gen + completeness) · shacl-runner · seo-gate · axe-gate (axe-core a11y) · vuln-gate (npm audit) · html-validator-gate (vnu) · baseline-gate (web-features) · readability-gate · commonmark-runner · semantic (lone)
 gates/conformance/  conformance-report — lone's conformance() projection (Node port of jsr:@bounded-systems/lone@0.4) + a generic HTML renderer
-generators/   gen-cid (IPFS UnixFS) · gen-identity (did:web + VC) · openapi (static-API helper core)
+generators/   gen-cid (IPFS UnixFS) · gen-identity (did:web + VC) · gen-snapshots (reader/markdown) · openapi (static-API helper core)
 emitters/     reprDigest (RFC 9530) · securityTxt (RFC 9116) · webManifest · markdown-sibling headers
 lib/          schema-validate (zero-dep JSON Schema) · config (env/arg helpers)
 fixtures/ test/  isolated verification of the generic logic
@@ -88,6 +88,7 @@ criteria are reported + summarised per area but never widen the headline claim.
 |---|---|---|
 | `gen-cid.mjs` | `DIST=dist node …/gen-cid.mjs` | `$DIST`. Walks the `site.sha256` file set (or `dist`), computes the IPFS UnixFS dir CIDv1 with no daemon, records it into `$DIST/provenance.json`. |
 | `gen-identity.mjs` | `IDENTITY_DOMAIN=… IDENTITY_REPO=owner/repo node …/gen-identity.mjs` | `$IDENTITY_DOMAIN`, `$IDENTITY_REPO` (cert-identity regexp), `$IDENTITY_SUBJECT` (the credentialSubject JSON, default `$DIST/resume.json`), optional `$IDENTITY_SUBJECT_SCHEMA`, `$IDENTITY_VC_NAME/DESCRIPTION`, `$IDENTITY_VALID_FROM_PATH`. Emits `did.json` + a W3C VC 2.0. |
+| `gen-snapshots.mjs` | `node …/gen-snapshots.mjs [distDir]` | `$SNAPSHOT_DIST` (default `dist`). Optional `$SNAPSHOT_PAGES`, `$SNAPSHOT_BASE_URL` (recorded as `source` in the front-matter), `$SNAPSHOT_SUFFIX` (default `.reader`). For every built page, runs **@mozilla/readability** (the engine behind Firefox Reader View, run over a `linkedom` DOM — headless, no browser) and writes a clean reader **`<page>.reader.html`** + an analysis-friendly **`<page>.reader.md`** (YAML front-matter + Markdown via `turndown`). The Markdown is the durable, diffable twin of the page — far easier to run NLP/LLM analysis over than scraping live HTML — and doubles as the AI-readable Markdown sibling. (The printed/PDF view needs a print-CSS renderer and is a separate generator.) |
 | `openapi.mjs` | `import { sortKeys, writeApiFile, embedSchema, jsonResponse, validateOpenapi }` | The **generic core** of a static-API generator. The per-endpoint projection of a site's contracts (profile/posts/corpus/VC, etc.) stays in the site's build; this module provides deterministic JSON output, schema embedding, and OpenAPI 3.1/3.2 well-formedness validation. Pair with `lib/schema-validate.mjs` to self-check emitted docs. |
 
 ### emitters/

diff --git a/fixtures/snapshots/article.html b/fixtures/snapshots/article.html
@@ -0,0 +1,20 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head><meta charset="utf-8" /><title>The Bet — Bounded Systems</title></head>
+  <body>
+    <nav><a href="/">Home</a> <a href="/about">About</a></nav>
+    <article>
+      <h1>The Bet</h1>
+      <p>Good software splits into parts. Each good idea becomes its own piece
+      with clear edges and a defined <strong>contract</strong> — a promise about
+      what it does. Those clear edges are also what let you scope an agent to one
+      piece, and that is the whole wager of this project.</p>
+      <h2>Why it matters</h2>
+      <p>An agent can author one of those abstractions cheaply now. What it cannot
+      do yet is keep many of them honest against each other as they evolve. The
+      unsolved work lives in the <em>seams</em> between ideas, not within one.</p>
+      <ul><li>one checkpoint per capability</li><li>signed, attributable effects</li></ul>
+    </article>
+    <footer>© Bounded Systems</footer>
+  </body>
+</html>
diff --git a/generators/gen-snapshots.mjs b/generators/gen-snapshots.mjs
@@ -0,0 +1,108 @@
+#!/usr/bin/env node
+// Reader-view snapshot generator — for every built page, emit a clean READER
+// extraction (Mozilla's Readability, the engine behind Firefox Reader View) as
+// both HTML and Markdown. The Markdown is the durable, analysis-friendly twin of
+// the page: machine-readable, diffable, and far easier to run NLP / LLM analysis
+// over than scraping live HTML — and it doubles as the AI-readable Markdown sibling
+// (`semantic.ai-readability`). A non-empty extraction is also the PROOF of the
+// "reader survivability" the structure-audit grades (`readerOk`).
+//
+//   node generators/gen-snapshots.mjs [distDir]    # write <page>.reader.{html,md}
+//
+// Pure (no browser, no network): linkedom parses the DOM, @mozilla/readability
+// extracts the article, turndown renders Markdown. (The PRINTED/PDF view needs a
+// real print-CSS renderer — tezcatl --pdf locally — and is a separate generator.)
+//
+// Config-driven; NOTHING about any one site is hard-coded:
+//   argv[2] / $SNAPSHOT_DIST   built output dir                  (default: "dist")
+//   $SNAPSHOT_PAGES           comma list of page paths under dist (default: every *.html)
+//   $SNAPSHOT_BASE_URL        site origin, recorded as `source` in the front-matter
+//   $SNAPSHOT_SUFFIX          output basename suffix              (default: ".reader")
+//
+// The pure extract/markdown functions are exported for unit testing.
+import { realpathSync } from "node:fs";
+import { writeFile, readFile, readdir, access } from "node:fs/promises";
+import { resolve, join, relative, dirname, basename, extname, sep } from "node:path";
+import { pathToFileURL } from "node:url";
+import { parseHTML } from "linkedom";
+import { Readability } from "@mozilla/readability";
+import TurndownService from "turndown";
+
+// ── Pure core (browser-free; unit-testable) ──────────────────────────────────
+
+/** Extract the reader view of an HTML document. Returns null when Readability
+ *  cannot find article content (e.g. a nav-only or empty page). */
+export function extractReader(html, { url = "" } = {}) {
+  const { document } = parseHTML(html);
+  const article = new Readability(document).parse();
+  if (!article || !article.content) return null;
+  return {
+    url,
+    title: article.title || "",
+    byline: article.byline || "",
+    excerpt: article.excerpt || "",
+    siteName: article.siteName || "",
+    length: article.length || 0,
+    contentHtml: article.content,
+    text: article.textContent || "",
+  };
+}
+
+const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced", bulletListMarker: "-" });
+
+/** Render a reader extraction to Markdown with a small YAML front-matter (title,
+ *  byline, excerpt, source) so the snapshot is self-describing for analysis. */
+export function toMarkdown(reader) {
+  const q = (s) => JSON.stringify(String(s));
+  const fm = [
+    "---",
+    `title: ${q(reader.title)}`,
+    reader.byline ? `byline: ${q(reader.byline)}` : null,
+    reader.excerpt ? `excerpt: ${q(reader.excerpt)}` : null,
+    reader.url ? `source: ${reader.url}` : null,
+    "---",
+  ].filter((x) => x != null).join("\n");
+  return `${fm}\n\n${turndown.turndown(reader.contentHtml).trim()}\n`;
+}
+
+// ── Impure runner ────────────────────────────────────────────────────────────
+
+async function walkHtml(dir, base = dir) {
+  const out = [];
+  for (const e of await readdir(dir, { withFileTypes: true })) {
+    const p = join(dir, e.name);
+    if (e.isDirectory()) out.push(...await walkHtml(p, base));
+    else if (e.name.endsWith(".html")) out.push(p);
+  }
+  return out;
+}
+
+// ── CLI ──────────────────────────────────────────────────────────────────────
+
+async function main() {
+  const dist = resolve(process.argv[2] && !process.argv[2].startsWith("--") ? process.argv[2] : process.env.SNAPSHOT_DIST || "dist");
+  const exists = async (p) => { try { await access(p); return true; } catch { return false; } };
+  if (!(await exists(dist))) { console.error(`✗ gen-snapshots: ${dist} not found — build first.`); process.exit(2); }
+
+  const suffix = process.env.SNAPSHOT_SUFFIX || ".reader";
+  const baseUrl = (process.env.SNAPSHOT_BASE_URL || "").replace(/\/$/, "");
+  let pages = (process.env.SNAPSHOT_PAGES || "").split(",").map((s) => s.trim().replace(/^\//, "")).filter(Boolean);
+  pages = pages.length ? pages.map((p) => resolve(dist, p)) : (await walkHtml(dist)).sort();
+
+  let wrote = 0, skipped = 0;
+  for (const file of pages) {
+    const rel = relative(dist, file);
+    const url = baseUrl ? `${baseUrl}/${rel.replace(/index\.html$/, "").replace(/\.html$/, "")}` : "";
+    const reader = extractReader(await readFile(file, "utf8"), { url });
+    if (!reader) { console.error(`  · skipped ${rel} (no article content)`); skipped++; continue; }
+    const stem = join(dirname(file), basename(file, extname(file)) + suffix);
+    await writeFile(`${stem}.html`, reader.contentHtml.trim() + "\n");
+    await writeFile(`${stem}.md`, toMarkdown(reader));
+    console.log(`  ✓ ${rel} → ${relative(dist, stem)}.{html,md} (${reader.length} chars)`);
+    wrote++;
+  }
+  console.log(`✓ gen-snapshots: ${wrote} reader snapshot(s) written${skipped ? `, ${skipped} skipped` : ""}.`);
+}
+
+// Only run the CLI when invoked directly (not when imported by a test).
+if (import.meta.url === `file://${process.argv[1]}`) {
+  main().catch((e) => { console.error("✗ gen-snapshots: error —", e.stack || e.message); process.exit(1); });
+}
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -25,7 +25,8 @@
     "ck-readability-gate": "./gates/readability-gate.mjs",
     "ck-commonmark-runner": "./gates/commonmark-runner.mjs",
     "ck-gen-cid": "./generators/gen-cid.mjs",
-    "ck-gen-identity": "./generators/gen-identity.mjs"
+    "ck-gen-identity": "./generators/gen-identity.mjs",
+    "ck-gen-snapshots": "./generators/gen-snapshots.mjs"
   },
   "scripts": {
     "test": "node test/run.mjs"
@@ -41,6 +42,7 @@
     "sigstore": "^5.0.0",
     "stylelint": "^17.14.0",
     "stylelint-plugin-use-baseline": "^1.4.4",
+    "turndown": "^7.2.4",
     "vnu-jar": "^26.6.24"
   }
 }
diff --git a/test/run.mjs b/test/run.mjs
@@ -413,6 +413,31 @@ await test("gates/baseline-gate: classify + threshold, e2e on fixtures", async (
     `pure logic asserted · e2e (stylelint): good=widely, bad=${bad.status} (${bad.offenders.length} below-widely)`);
 });
 
+// 17. gen-snapshots: reader extraction → Markdown (pure, deterministic — runs in CI).
+await test("generators/gen-snapshots: reader extraction + markdown", async () => {
+  const { extractReader, toMarkdown } = await import(join(KIT, "generators", "gen-snapshots.mjs"));
+  const html = await readFile(join(FIX, "snapshots", "article.html"), "utf8");
+
+  const reader = extractReader(html, { url: "https://fixture.example/the-bet" });
+  if (!reader) throw new Error("article fixture must extract a reader view");
+  if (!/The Bet/.test(reader.title)) throw new Error(`title not extracted (got ${reader.title})`);
+  if (/About<\/a>|<footer/i.test(reader.contentHtml)) throw new Error("reader content must strip nav/footer chrome");
+  if (!/clear edges/.test(reader.text)) throw new Error("reader text must carry the article body");
+
+  const md = toMarkdown(reader);
+  if (!/^---\n/.test(md)) throw new Error("markdown must lead with YAML front-matter");
+  if (!/source: https:\/\/fixture\.example\/the-bet/.test(md)) throw new Error("front-matter must record the source url");
+  if (!/## Why it matters/.test(md)) throw new Error("markdown must carry headings (## Why it matters)");
+  if (!/-\s+one checkpoint per capability/.test(md)) throw new Error("markdown must carry list items");
+  if (/<nav>|<footer>/.test(md)) throw new Error("markdown must not contain page chrome");
+
+  // a contentless page extracts to null, which the generator skips gracefully.
+  if (extractReader("<!DOCTYPE html><html><head><title>x</title></head><body></body></html>") !== null) {
+    throw new Error("a contentless page should yield no reader view (null)");
+  }
+  ok("generators/gen-snapshots: reader extraction + markdown", `title + front-matter + ${md.split("\n").length}-line markdown`);
+});
+
 await rm(work, { recursive: true, force: true });
 console.log(`\n${failed ? "✗" : "✓"} conformance-kit tests: ${passed} passed, ${failed} failed`);
 process.exit(failed ? 1 : 0);