diff --git a/README.md b/README.md index 4477514..c241777 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ hardcodes `robertdelanghe.dev`, `bounded.tools`, an account, or an email. integrity/ verify-site · verify (sigstore) · gen-sitemanifest · gen-provenance · structure-audit · http-probe gates/ sbom (gen + completeness) · shacl-runner · seo-gate · axe-gate (axe-core a11y) · vuln-gate (npm audit) · html-validator-gate (vnu) · baseline-gate (web-features) · readability-gate · commonmark-runner · semantic (lone) gates/conformance/ conformance-report — lone's conformance() projection (Node port of jsr:@bounded-systems/lone@0.4) + a generic HTML renderer -generators/ gen-cid (IPFS UnixFS) · gen-identity (did:web + VC) · openapi (static-API helper core) +generators/ gen-cid (IPFS UnixFS) · gen-identity (did:web + VC) · gen-snapshots (reader/markdown) · openapi (static-API helper core) emitters/ reprDigest (RFC 9530) · securityTxt (RFC 9116) · webManifest · markdown-sibling headers lib/ schema-validate (zero-dep JSON Schema) · config (env/arg helpers) fixtures/ test/ isolated verification of the generic logic @@ -88,6 +88,7 @@ criteria are reported + summarised per area but never widen the headline claim. |---|---|---| | `gen-cid.mjs` | `DIST=dist node …/gen-cid.mjs` | `$DIST`. Walks the `site.sha256` file set (or `dist`), computes the IPFS UnixFS dir CIDv1 with no daemon, records it into `$DIST/provenance.json`. | | `gen-identity.mjs` | `IDENTITY_DOMAIN=… IDENTITY_REPO=owner/repo node …/gen-identity.mjs` | `$IDENTITY_DOMAIN`, `$IDENTITY_REPO` (cert-identity regexp), `$IDENTITY_SUBJECT` (the credentialSubject JSON, default `$DIST/resume.json`), optional `$IDENTITY_SUBJECT_SCHEMA`, `$IDENTITY_VC_NAME/DESCRIPTION`, `$IDENTITY_VALID_FROM_PATH`. Emits `did.json` + a W3C VC 2.0. | +| `gen-snapshots.mjs` | `node …/gen-snapshots.mjs [distDir]` | `$SNAPSHOT_DIST` (default `dist`). 
Optional `$SNAPSHOT_PAGES`, `$SNAPSHOT_BASE_URL` (recorded as `source` in the front-matter), `$SNAPSHOT_SUFFIX` (default `.reader`). For every built page, runs **@mozilla/readability** (the Firefox/Safari Reader engine, via `linkedom` — headless, no browser) and writes a clean reader **`.reader.html`** + an analysis-friendly **`.reader.md`** (YAML front-matter + Markdown via `turndown`). The Markdown is the durable, diffable twin of the page — far easier to run NLP/LLM analysis over than scraping live HTML — and doubles as the AI-readable Markdown sibling. (The printed/PDF view needs a print-CSS renderer and is a separate generator.) | | `openapi.mjs` | `import { sortKeys, writeApiFile, embedSchema, jsonResponse, validateOpenapi }` | The **generic core** of a static-API generator. The per-endpoint projection of a site's contracts (profile/posts/corpus/VC, etc.) stays in the site's build; this module provides deterministic JSON output, schema embedding, and OpenAPI 3.1/3.2 well-formedness validation. Pair with `lib/schema-validate.mjs` to self-check emitted docs. | ### emitters/ diff --git a/fixtures/snapshots/article.html b/fixtures/snapshots/article.html new file mode 100644 index 0000000..b873e9e --- /dev/null +++ b/fixtures/snapshots/article.html @@ -0,0 +1,20 @@ + + + The Bet — Bounded Systems + + +
+

The Bet

+

Good software splits into parts. Each good idea becomes its own piece + with clear edges and a defined contract — a promise about + what it does. Those clear edges are also what let you scope an agent to one + piece, and that is the whole wager of this project.

+

Why it matters

+

An agent can author one of those abstractions cheaply now. What it cannot + do yet is keep many of them honest against each other as they evolve. The + unsolved work lives in the seams between ideas, not within one.

+
  • one checkpoint per capability
  • signed, attributable effects
+
+
© Bounded Systems
+
+
diff --git a/generators/gen-snapshots.mjs b/generators/gen-snapshots.mjs
new file mode 100644
index 0000000..d85b479
--- /dev/null
+++ b/generators/gen-snapshots.mjs
@@ -0,0 +1,130 @@
+#!/usr/bin/env node
+// Reader-view snapshot generator — for every built page, emit a clean READER
+// extraction (the same Readability engine that powers Firefox/Safari Reader) as
+// both HTML and Markdown. The Markdown is the durable, analysis-friendly twin of
+// the page: machine-readable, diffable, and far easier to run NLP / LLM analysis
+// over than scraping live HTML — and it doubles as the AI-readable Markdown sibling
+// (`semantic.ai-readability`). A non-empty extraction is also the PROOF of the
+// "reader survivability" the structure-audit grades (`readerOk`).
+//
+//   node generators/gen-snapshots.mjs [distDir]   # write .reader.{html,md}
+//
+// Pure (no browser, no network): linkedom parses the DOM, @mozilla/readability
+// extracts the article, turndown renders Markdown. (The PRINTED/PDF view needs a
+// real print-CSS renderer — tezcatl --pdf locally — and is a separate generator.)
+//
+// Config-driven; NOTHING about any one site is hard-coded:
+//   argv[2] / $SNAPSHOT_DIST   built output dir (default: "dist")
+//   $SNAPSHOT_PAGES            comma list of page paths under dist (default: every *.html)
+//   $SNAPSHOT_BASE_URL         site origin, recorded as `source` in the front-matter
+//   $SNAPSHOT_SUFFIX           output basename suffix (default: ".reader")
+//
+// The pure extract/markdown functions are exported for unit testing.
+import { realpathSync } from "node:fs";
+import { writeFile, readFile, readdir, access } from "node:fs/promises";
+import { resolve, join, relative, dirname, basename, extname, sep } from "node:path";
+import { pathToFileURL } from "node:url";
+import { parseHTML } from "linkedom";
+import { Readability } from "@mozilla/readability";
+import TurndownService from "turndown";
+
+// ── Pure core (browser-free; unit-testable) ──────────────────────────────────
+
+/** Extract the reader view of an HTML document. Returns null when Readability
+ * cannot find article content (e.g. a nav-only or empty page). */
+export function extractReader(html, { url = "" } = {}) {
+  const { document } = parseHTML(html);
+  const article = new Readability(document).parse();
+  if (!article || !article.content) return null;
+  return {
+    url,
+    title: article.title || "",
+    byline: article.byline || "",
+    excerpt: article.excerpt || "",
+    siteName: article.siteName || "",
+    length: article.length || 0,
+    contentHtml: article.content,
+    text: article.textContent || "",
+  };
+}
+
+const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced", bulletListMarker: "-" });
+
+/** Render a reader extraction to Markdown with a small YAML front-matter (title,
+ * byline, excerpt, source) so the snapshot is self-describing for analysis. */
+export function toMarkdown(reader) {
+  const q = (s) => JSON.stringify(String(s));
+  const fm = [
+    "---",
+    `title: ${q(reader.title)}`,
+    reader.byline ? `byline: ${q(reader.byline)}` : null,
+    reader.excerpt ? `excerpt: ${q(reader.excerpt)}` : null,
+    reader.url ? `source: ${reader.url}` : null,
+    "---",
+  ].filter((x) => x != null).join("\n");
+  return `${fm}\n\n${turndown.turndown(reader.contentHtml).trim()}\n`;
+}
+
+// ── Impure runner ────────────────────────────────────────────────────────────
+
+/** Recursively list the *.html files under `dir`. `exclude(name)` lets the caller
+ * drop files by basename — main() uses it to skip snapshots written by an earlier
+ * run, so a re-run never snapshots its own `<suffix>.html` output. */
+async function walkHtml(dir, base = dir, exclude = () => false) {
+  const out = [];
+  for (const e of await readdir(dir, { withFileTypes: true })) {
+    const p = join(dir, e.name);
+    if (e.isDirectory()) out.push(...await walkHtml(p, base, exclude));
+    else if (e.name.endsWith(".html") && !exclude(e.name)) out.push(p);
+  }
+  return out;
+}
+
+// ── CLI ──────────────────────────────────────────────────────────────────────
+
+async function main() {
+  const dist = resolve(process.argv[2] && !process.argv[2].startsWith("--") ? process.argv[2] : process.env.SNAPSHOT_DIST || "dist");
+  const exists = async (p) => { try { await access(p); return true; } catch { return false; } };
+  if (!(await exists(dist))) { console.error(`✗ gen-snapshots: ${dist} not found — build first.`); process.exit(2); }
+
+  const suffix = process.env.SNAPSHOT_SUFFIX || ".reader";
+  const baseUrl = (process.env.SNAPSHOT_BASE_URL || "").replace(/\/$/, "");
+  let pages = (process.env.SNAPSHOT_PAGES || "").split(",").map((s) => s.trim().replace(/^\//, "")).filter(Boolean);
+  // Auto-discovery skips our own `<suffix>.html` output so re-runs stay idempotent
+  // (otherwise a second run would emit `.reader.reader.html`, and so on).
+  const isSnapshot = (name) => name.endsWith(`${suffix}.html`);
+  pages = pages.length ? pages.map((p) => resolve(dist, p)) : (await walkHtml(dist, dist, isSnapshot)).sort();
+
+  let wrote = 0, skipped = 0;
+  for (const file of pages) {
+    const rel = relative(dist, file);
+    // URLs always use "/", whatever the platform path separator is.
+    const urlPath = rel.split(sep).join("/").replace(/index\.html$/, "").replace(/\.html$/, "");
+    const url = baseUrl ? `${baseUrl}/${urlPath}` : "";
+    const reader = extractReader(await readFile(file, "utf8"), { url });
+    if (!reader) { console.error(` · skipped ${rel} (no article content)`); skipped++; continue; }
+    const stem = join(dirname(file), basename(file, extname(file)) + suffix);
+    await writeFile(`${stem}.html`, reader.contentHtml.trim() + "\n");
+    await writeFile(`${stem}.md`, toMarkdown(reader));
+    console.log(` ✓ ${rel} → ${relative(dist, stem)}.{html,md} (${reader.length} chars)`);
+    wrote++;
+  }
+  console.log(`✓ gen-snapshots: ${wrote} reader snapshot(s) written${skipped ? `, ${skipped} skipped` : ""}.`);
+}
+
+/** True when this module is the process entry point. Compares real paths as file
+ * URLs, so it also holds when launched through a symlink (the npm `bin` shim for
+ * `ck-gen-snapshots`) or from a path that needs percent-encoding. */
+function isMain() {
+  if (!process.argv[1]) return false;
+  try {
+    return import.meta.url === pathToFileURL(realpathSync(process.argv[1])).href;
+  } catch {
+    return false;
+  }
+}
+
+// Only run the CLI when invoked directly (not when imported by a test).
+if (isMain()) {
+  main().catch((e) => { console.error("✗ gen-snapshots: error —", e.stack || e.message); process.exit(1); });
+}
diff --git a/package-lock.json b/package-lock.json
index 583c42b..88cb763 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -19,10 +19,12 @@
         "sigstore": "^5.0.0",
         "stylelint": "^17.14.0",
         "stylelint-plugin-use-baseline": "^1.4.4",
+        "turndown": "^7.2.4",
         "vnu-jar": "^26.6.24"
       },
       "bin": {
         "ck-axe-gate": "gates/axe-gate.mjs",
+        "ck-baseline-gate": "gates/baseline-gate.mjs",
         "ck-check-sbom": "gates/sbom/check-sbom.mjs",
         "ck-commonmark-runner": "gates/commonmark-runner.mjs",
         "ck-gen-cid": "generators/gen-cid.mjs",
@@ -296,6 +298,12 @@
       "integrity": "sha512-dXn3FZhPv0US+7dtJsIi2R+c7qWYiReoEh5zUntWCf4oSpMNib8FDhSoed6m3QyZdx5hK7iLFkYk3rNxwt8vTA==",
       "license": "MIT"
     },
+    "node_modules/@mixmark-io/domino": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz",
+      "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==",
+      "license": "BSD-2-Clause"
+    },
     "node_modules/@mozilla/readability": {
       "version": "0.5.0",
       "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
@@ -3468,6 +3476,19 @@
         "node": "^22.22.2 || ^24.15.0 || >=26.0.0"
       }
     },
+    "node_modules/turndown": {
+      "version": "7.2.4",
+      "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.4.tgz",
+      "integrity": "sha512-I8yFsfRzmzK0WV1pNNOA4A7y4RDfFxPRxb3t+e3ui14qSGOxGtiSP6GjeX+Y6CHb7HYaFj7ECUD7VE5kQMZWGQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@mixmark-io/domino": "^2.2.0"
+      },
+      "engines": {
+        "node": ">=18",
+        "npm": ">=9"
+      }
+    },
     "node_modules/uhyphen": {
       "version": "0.2.0",
       "resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz",
diff --git a/package.json b/package.json
index ddbfee2..dde1fd7 100644
--- a/package.json
+++ b/package.json
@@ -25,7 +25,8 @@
     "ck-readability-gate":
"./gates/readability-gate.mjs", "ck-commonmark-runner": "./gates/commonmark-runner.mjs", "ck-gen-cid": "./generators/gen-cid.mjs", - "ck-gen-identity": "./generators/gen-identity.mjs" + "ck-gen-identity": "./generators/gen-identity.mjs", + "ck-gen-snapshots": "./generators/gen-snapshots.mjs" }, "scripts": { "test": "node test/run.mjs" @@ -41,6 +42,7 @@ "sigstore": "^5.0.0", "stylelint": "^17.14.0", "stylelint-plugin-use-baseline": "^1.4.4", + "turndown": "^7.2.4", "vnu-jar": "^26.6.24" } } diff --git a/test/run.mjs b/test/run.mjs index 0d32e05..f15f9c9 100755 --- a/test/run.mjs +++ b/test/run.mjs @@ -413,6 +413,31 @@ await test("gates/baseline-gate: classify + threshold, e2e on fixtures", async ( `pure logic asserted · e2e (stylelint): good=widely, bad=${bad.status} (${bad.offenders.length} below-widely)`); }); +// 17. gen-snapshots: reader extraction → Markdown (pure, deterministic — runs in CI). +await test("generators/gen-snapshots: reader extraction + markdown", async () => { + const { extractReader, toMarkdown } = await import(join(KIT, "generators", "gen-snapshots.mjs")); + const html = await readFile(join(FIX, "snapshots", "article.html"), "utf8"); + + const reader = extractReader(html, { url: "https://fixture.example/the-bet" }); + if (!reader) throw new Error("article fixture must extract a reader view"); + if (!/The Bet/.test(reader.title)) throw new Error(`title not extracted (got ${reader.title})`); + if (/About<\/a>|