diff --git a/README.md b/README.md index dbe7b60..57d40b2 100644 --- a/README.md +++ b/README.md @@ -191,6 +191,8 @@ npx dotaios ingest call-notes.txt --to signal # working Every Markdown file gets full provenance frontmatter (`source`, `ingested_at`, `kind`, `parser`, `title`). Documents are parsed locally, nothing is uploaded. PDFs use the bundled `unpdf` extractor by default. Install [marker-pdf](https://github.com/datalab-to/marker) for high-fidelity PDF / DOCX / PPTX / EPUB parsing. +Web pages are fetched using [Lightpanda](https://github.com/lightpanda-io/browser), a lightweight headless browser that renders JavaScript. It installs automatically during `dotaios setup`. + ## Daily Brief ```bash diff --git a/docs/superpowers/plans/2026-05-18-lightpanda-ingest.md b/docs/superpowers/plans/2026-05-18-lightpanda-ingest.md new file mode 100644 index 0000000..394442c --- /dev/null +++ b/docs/superpowers/plans/2026-05-18-lightpanda-ingest.md @@ -0,0 +1,1042 @@ +# Lightpanda Web Ingest Integration — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make `dotaios ingest ` use Lightpanda (a headless browser) by default, installed automatically during `dotaios setup`, with silent fallback to plain fetch when missing or failing. + +**Architecture:** A new zero-dep `packages/core/src/lightpanda.mjs` module owns platform detection, download to `~/.dotaios/bin/lightpanda`, and resolver lookup. `packages/cli/src/ingest/web.mjs` gets a `fetchHtml()` dispatcher that prefers Lightpanda (via `spawnSync`) and falls back to plain `fetch`. `setup.mjs` calls the downloader once. The `AGENTS.md` template gains a rule routing all URL reading through `dotaios ingest`. 
+ +**Tech Stack:** Node 20 ESM, zero-dep core, `node:test`, `node:child_process` (`spawnSync`), `node:https`/`node:fs`. Lightpanda binary fetched from `https://github.com/lightpanda-io/browser/releases/latest/download/`. + +**Conventions:** +- Tests run with `node --test tests/**/*.test.mjs`. All 249 existing tests must stay green. +- Use `import` (ESM). No `require`. +- `packages/core` has zero npm deps — `lightpanda.mjs` must use only `node:` built-ins. +- All new fetch/spawn/clock calls accept injectable overrides for testing (`fetchImpl`, `spawnImpl`, `now`, `resolveLightpandaImpl`). + +--- + +## File Map + +| File | Change | +|---|---| +| `packages/core/src/paths.mjs` | Add `dotaiosBinDir()`, `lightpandaBinPath()` | +| `packages/core/src/lightpanda.mjs` | **New** — `lightpandaPlatformBinary()`, `downloadLightpanda()`, `resolveLightpanda()` | +| `packages/cli/src/ingest/web.mjs` | Add `fetchHtml()` dispatcher, thread `parser` field through, write one-time hint flag | +| `packages/cli/src/commands/setup.mjs` | Call `downloadLightpanda({ silent: false })` after wizard, before final summary | +| `templates/AGENTS.md.hbs` | Add URL routing rule under `## Rules` | +| `README.md` | One-line mention in ingest section | +| `tests/core/lightpanda.test.mjs` | **New** — platform binary, download (stubbed fetch+fs), resolve fallback chain | +| `tests/cli/ingest_routing.test.mjs` | Add 3 direct `ingestUrl()` tests for lightpanda success / lightpanda crash → fallback / lightpanda missing | +| `tests/core/render.test.mjs` | Add assertion that `AGENTS.md` template includes the routing rule | + +--- + +## Task 1: `dotaiosBinDir()` + `lightpandaBinPath()` in core paths + +**Files:** +- Modify: `packages/core/src/paths.mjs` +- Test: `tests/core/paths.test.mjs` (new file) + +- [ ] **Step 1: Write failing test** + +Create `tests/core/paths.test.mjs`: + +```js +import os from "node:os"; +import path from "node:path"; +import test from "node:test"; +import assert from 
"node:assert/strict"; +import { dotaiosBinDir, lightpandaBinPath } from "../../packages/core/src/paths.mjs"; + +test("dotaiosBinDir returns ~/.dotaios/bin", () => { + assert.equal(dotaiosBinDir(), path.join(os.homedir(), ".dotaios", "bin")); +}); + +test("lightpandaBinPath returns ~/.dotaios/bin/lightpanda on unix", { skip: process.platform === "win32" }, () => { + assert.equal(lightpandaBinPath(), path.join(os.homedir(), ".dotaios", "bin", "lightpanda")); +}); + +test("lightpandaBinPath returns ~/.dotaios/bin/lightpanda.exe on windows", { skip: process.platform !== "win32" }, () => { + assert.equal(lightpandaBinPath(), path.join(os.homedir(), ".dotaios", "bin", "lightpanda.exe")); +}); +``` + +- [ ] **Step 2: Run test, expect failure** + +```bash +node --test tests/core/paths.test.mjs +``` +Expected: `Cannot find ... dotaiosBinDir` or `is not a function`. + +- [ ] **Step 3: Implement in `packages/core/src/paths.mjs`** + +Add at the bottom of the file: + +```js +export function dotaiosBinDir() { + return path.join(os.homedir(), ".dotaios", "bin"); +} + +export function lightpandaBinPath() { + const ext = process.platform === "win32" ? ".exe" : ""; + return path.join(dotaiosBinDir(), `lightpanda${ext}`); +} +``` + +- [ ] **Step 4: Run test, expect pass** + +```bash +node --test tests/core/paths.test.mjs +``` +Expected: 3 tests passing (with 1 skipped depending on platform). + +- [ ] **Step 5: Run full test suite to confirm no regression** + +```bash +node --test tests/**/*.test.mjs 2>&1 | tail -20 +``` +Expected: 252 tests passing (249 + 3 new). 
+ +- [ ] **Step 6: Commit** + +```bash +git add packages/core/src/paths.mjs tests/core/paths.test.mjs +git commit -m "feat(core): add dotaiosBinDir and lightpandaBinPath helpers" +``` + +--- + +## Task 2: `lightpandaPlatformBinary()` — platform → binary name + +**Files:** +- Create: `packages/core/src/lightpanda.mjs` +- Test: `tests/core/lightpanda.test.mjs` (new) + +- [ ] **Step 1: Write failing test** + +Create `tests/core/lightpanda.test.mjs`: + +```js +import test from "node:test"; +import assert from "node:assert/strict"; +import { lightpandaPlatformBinary } from "../../packages/core/src/lightpanda.mjs"; + +test("lightpandaPlatformBinary maps darwin+arm64 to aarch64-macos", () => { + assert.equal(lightpandaPlatformBinary({ platform: "darwin", arch: "arm64" }), "lightpanda-aarch64-macos"); +}); + +test("lightpandaPlatformBinary maps darwin+x64 to x86_64-macos", () => { + assert.equal(lightpandaPlatformBinary({ platform: "darwin", arch: "x64" }), "lightpanda-x86_64-macos"); +}); + +test("lightpandaPlatformBinary maps linux+arm64 to aarch64-linux", () => { + assert.equal(lightpandaPlatformBinary({ platform: "linux", arch: "arm64" }), "lightpanda-aarch64-linux"); +}); + +test("lightpandaPlatformBinary maps linux+x64 to x86_64-linux", () => { + assert.equal(lightpandaPlatformBinary({ platform: "linux", arch: "x64" }), "lightpanda-x86_64-linux"); +}); + +test("lightpandaPlatformBinary returns null on win32", () => { + assert.equal(lightpandaPlatformBinary({ platform: "win32", arch: "x64" }), null); +}); + +test("lightpandaPlatformBinary returns null on unknown platform/arch", () => { + assert.equal(lightpandaPlatformBinary({ platform: "linux", arch: "ppc64" }), null); + assert.equal(lightpandaPlatformBinary({ platform: "freebsd", arch: "x64" }), null); +}); + +test("lightpandaPlatformBinary defaults to current process when no arg", () => { + const out = lightpandaPlatformBinary(); + if (process.platform === "win32") assert.equal(out, null); + else assert.match(out, 
/^lightpanda-/); +}); +``` + +- [ ] **Step 2: Run test, expect failure** + +```bash +node --test tests/core/lightpanda.test.mjs +``` +Expected: `Cannot find module .../lightpanda.mjs`. + +- [ ] **Step 3: Implement** + +Create `packages/core/src/lightpanda.mjs`: + +```js +const PLATFORM_BINARIES = { + "darwin:arm64": "lightpanda-aarch64-macos", + "darwin:x64": "lightpanda-x86_64-macos", + "linux:arm64": "lightpanda-aarch64-linux", + "linux:x64": "lightpanda-x86_64-linux" +}; + +export function lightpandaPlatformBinary({ platform = process.platform, arch = process.arch } = {}) { + return PLATFORM_BINARIES[`${platform}:${arch}`] ?? null; +} +``` + +- [ ] **Step 4: Run test, expect pass** + +```bash +node --test tests/core/lightpanda.test.mjs +``` +Expected: 7 tests passing. + +- [ ] **Step 5: Commit** + +```bash +git add packages/core/src/lightpanda.mjs tests/core/lightpanda.test.mjs +git commit -m "feat(core): map platform+arch to lightpanda binary name" +``` + +--- + +## Task 3: `downloadLightpanda()` — fetch binary to `~/.dotaios/bin/` + +**Files:** +- Modify: `packages/core/src/lightpanda.mjs` +- Test: `tests/core/lightpanda.test.mjs` + +- [ ] **Step 1: Write failing tests** (append to `tests/core/lightpanda.test.mjs`) + +```js +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { downloadLightpanda } from "../../packages/core/src/lightpanda.mjs"; + +function makeFakeFetch({ status = 200, body = "FAKE_BINARY_BYTES" } = {}) { + return async () => ({ + ok: status >= 200 && status < 300, + status, + statusText: status === 200 ? 
"OK" : "ERR", + arrayBuffer: async () => new TextEncoder().encode(body).buffer + }); +} + +test("downloadLightpanda writes binary to destBinPath and chmods +x on unix", { skip: process.platform === "win32" }, async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "dotaios-lp-dl-")); + const destBinPath = path.join(tmp, "bin", "lightpanda"); + try { + const result = await downloadLightpanda({ + silent: true, + fetchImpl: makeFakeFetch({ body: "BINARY" }), + destBinPath, + platformBinary: "lightpanda-x86_64-linux" + }); + assert.equal(result.ok, true); + const written = await fs.readFile(destBinPath, "utf8"); + assert.equal(written, "BINARY"); + const stat = await fs.stat(destBinPath); + assert.ok((stat.mode & 0o111) !== 0, "executable bit should be set"); + } finally { + await fs.rm(tmp, { recursive: true, force: true }); + } +}); + +test("downloadLightpanda returns { ok:false, reason } on HTTP error without throwing", async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "dotaios-lp-dl-")); + const destBinPath = path.join(tmp, "bin", "lightpanda"); + try { + const result = await downloadLightpanda({ + silent: true, + fetchImpl: makeFakeFetch({ status: 404 }), + destBinPath, + platformBinary: "lightpanda-x86_64-linux" + }); + assert.equal(result.ok, false); + assert.match(result.reason, /404/); + } finally { + await fs.rm(tmp, { recursive: true, force: true }); + } +}); + +test("downloadLightpanda returns { ok:false, reason } on network error without throwing", async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "dotaios-lp-dl-")); + const destBinPath = path.join(tmp, "bin", "lightpanda"); + try { + const result = await downloadLightpanda({ + silent: true, + fetchImpl: async () => { throw new Error("ECONNRESET"); }, + destBinPath, + platformBinary: "lightpanda-x86_64-linux" + }); + assert.equal(result.ok, false); + assert.match(result.reason, /ECONNRESET/); + } finally { + await fs.rm(tmp, { recursive: true, force: true }); 
+ } +}); + +test("downloadLightpanda returns { ok:false, reason:'unsupported-platform' } when platformBinary null", async () => { + const result = await downloadLightpanda({ + silent: true, + fetchImpl: makeFakeFetch(), + destBinPath: path.join(os.tmpdir(), "noop"), + platformBinary: null + }); + assert.equal(result.ok, false); + assert.equal(result.reason, "unsupported-platform"); +}); +``` + +- [ ] **Step 2: Run tests, expect failure** + +```bash +node --test tests/core/lightpanda.test.mjs +``` +Expected: `Cannot find ... downloadLightpanda`. + +- [ ] **Step 3: Implement in `packages/core/src/lightpanda.mjs`** + +Add imports at top: + +```js +import fs from "node:fs/promises"; +import path from "node:path"; +import { lightpandaBinPath } from "./paths.mjs"; +``` + +Add after `lightpandaPlatformBinary`: + +```js +const RELEASE_BASE = "https://github.com/lightpanda-io/browser/releases/latest/download"; + +export async function downloadLightpanda({ + silent = false, + fetchImpl = globalThis.fetch, + destBinPath = lightpandaBinPath(), + platformBinary = lightpandaPlatformBinary() +} = {}) { + if (!platformBinary) { + return { ok: false, reason: "unsupported-platform" }; + } + + const url = `${RELEASE_BASE}/${platformBinary}`; + if (!silent) console.log(`⬇ Installing Lightpanda for web browsing...`); + + try { + const response = await fetchImpl(url); + if (!response.ok) { + return { ok: false, reason: `HTTP ${response.status} ${response.statusText || ""}`.trim() }; + } + const buf = Buffer.from(await response.arrayBuffer()); + await fs.mkdir(path.dirname(destBinPath), { recursive: true }); + await fs.writeFile(destBinPath, buf); + if (process.platform !== "win32") { + await fs.chmod(destBinPath, 0o755); + } + if (!silent) console.log(` Installed Lightpanda → ${destBinPath}`); + return { ok: true, path: destBinPath }; + } catch (err) { + return { ok: false, reason: err.message || String(err) }; + } +} +``` + +- [ ] **Step 4: Run tests, expect pass** + +```bash +node 
--test tests/core/lightpanda.test.mjs +``` +Expected: 11 tests passing (7 from Task 2 + 4 new). + +- [ ] **Step 5: Commit** + +```bash +git add packages/core/src/lightpanda.mjs tests/core/lightpanda.test.mjs +git commit -m "feat(core): download lightpanda binary to ~/.dotaios/bin/" +``` + +--- + +## Task 4: `resolveLightpanda()` — local bin → PATH → null + +**Files:** +- Modify: `packages/core/src/lightpanda.mjs` +- Test: `tests/core/lightpanda.test.mjs` + +- [ ] **Step 1: Write failing tests** (append) + +```js +import { resolveLightpanda } from "../../packages/core/src/lightpanda.mjs"; + +test("resolveLightpanda returns local bin path when it exists and is executable", async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "dotaios-lp-res-")); + const localBin = path.join(tmp, "lightpanda"); + await fs.writeFile(localBin, "#!/bin/sh\necho fake"); + await fs.chmod(localBin, 0o755); + try { + const result = await resolveLightpanda({ + localBinPath: localBin, + whichImpl: () => null + }); + assert.equal(result, localBin); + } finally { + await fs.rm(tmp, { recursive: true, force: true }); + } +}); + +test("resolveLightpanda falls back to PATH when local missing", async () => { + const result = await resolveLightpanda({ + localBinPath: "/nonexistent/lightpanda", + whichImpl: () => "/usr/local/bin/lightpanda" + }); + assert.equal(result, "/usr/local/bin/lightpanda"); +}); + +test("resolveLightpanda returns null when neither local nor PATH has it", async () => { + const result = await resolveLightpanda({ + localBinPath: "/nonexistent/lightpanda", + whichImpl: () => null + }); + assert.equal(result, null); +}); +``` + +- [ ] **Step 2: Run tests, expect failure** + +```bash +node --test tests/core/lightpanda.test.mjs +``` +Expected: `Cannot find ... resolveLightpanda`. 
+ +- [ ] **Step 3: Implement** in `packages/core/src/lightpanda.mjs` + +Add import at top: + +```js +import { spawnSync } from "node:child_process"; +``` + +Add after `downloadLightpanda`: + +```js +function defaultWhich() { + const cmd = process.platform === "win32" ? "where" : "which"; + const result = spawnSync(cmd, ["lightpanda"], { encoding: "utf8" }); + if (result.status !== 0) return null; + const first = (result.stdout || "").split(/\r?\n/).map((s) => s.trim()).find(Boolean); + return first || null; +} + +export async function resolveLightpanda({ + localBinPath = lightpandaBinPath(), + whichImpl = defaultWhich +} = {}) { + try { + await fs.access(localBinPath); + return localBinPath; + } catch { + // not present, try PATH + } + const fromPath = whichImpl(); + return fromPath || null; +} +``` + +- [ ] **Step 4: Run tests, expect pass** + +```bash +node --test tests/core/lightpanda.test.mjs +``` +Expected: 14 tests passing. + +- [ ] **Step 5: Run full suite** + +```bash +node --test tests/**/*.test.mjs 2>&1 | tail -5 +``` +Expected: 252 + 7 + 4 + 3 = 266 tests passing (depending on platform-skipped count). + +- [ ] **Step 6: Commit** + +```bash +git add packages/core/src/lightpanda.mjs tests/core/lightpanda.test.mjs +git commit -m "feat(core): resolve lightpanda from local bin or PATH" +``` + +--- + +## Task 5: Lightpanda-backed `fetchHtml()` in ingest/web.mjs + +**Files:** +- Modify: `packages/cli/src/ingest/web.mjs` +- Test: `tests/cli/ingest_routing.test.mjs` + +This task threads a `parser` value through `ingestUrl()` so the frontmatter, `placeMarkdown`, and event log reflect which fetcher was used. + +- [ ] **Step 1: Write failing tests** (append to `tests/cli/ingest_routing.test.mjs`) + +Add at top of file: + +```js +import { ingestUrl } from "../../packages/cli/src/ingest/web.mjs"; +``` + +Add at bottom of file: + +```js +function htmlFixture(title = "Lightpanda Rendered") { + return `${title}

<h1>${title}</h1>

<p>${"Body paragraph. ".repeat(60)}</p>

`; +} + +function makeFakeFetch({ body = htmlFixture(), status = 200 } = {}) { + return async () => ({ + ok: status >= 200 && status < 300, + status, + statusText: "OK", + text: async () => body, + arrayBuffer: async () => new TextEncoder().encode(body).buffer, + headers: { get: () => "text/html; charset=utf-8" } + }); +} + +function makeWebWorkspace() { + const root = fs.mkdtempSync(path.join(os.tmpdir(), "dotaios-lp-web-")); + return { + root, + rawDir: path.join(root, "vault", "raw"), + assetsDir: path.join(root, "vault", "assets"), + eventsPath: path.join(root, "memory", "events.jsonl"), + hintFlagPath: path.join(root, "lightpanda_hint_shown") + }; +} + +test("ingestUrl uses lightpanda when resolver returns a path and spawn succeeds", async () => { + const ws = makeWebWorkspace(); + const html = htmlFixture("Lightpanda Win"); + const result = await ingestUrl("https://example.com/lp-win", { + rawDir: ws.rawDir, + eventsPath: ws.eventsPath, + fetchImpl: makeFakeFetch({ body: "SHOULD NOT BE USED" }), + resolveLightpandaImpl: async () => "/fake/lightpanda", + spawnImpl: () => ({ status: 0, stdout: html, stderr: "" }), + hintFlagPath: ws.hintFlagPath, + now: () => new Date("2026-05-18T12:00:00Z") + }); + assert.equal(result.action, "written"); + assert.equal(result.parser, "lightpanda+readability+turndown"); + const written = fs.readFileSync(result.destination, "utf8"); + assert.match(written, /parser: lightpanda\+readability\+turndown/); + assert.match(written, /Lightpanda Win/); +}); + +test("ingestUrl falls back to plain fetch when lightpanda spawn fails", async () => { + const ws = makeWebWorkspace(); + const html = htmlFixture("Plain Fetch Fallback"); + const result = await ingestUrl("https://example.com/fallback", { + rawDir: ws.rawDir, + eventsPath: ws.eventsPath, + fetchImpl: makeFakeFetch({ body: html }), + resolveLightpandaImpl: async () => "/fake/lightpanda", + spawnImpl: () => ({ status: 1, stdout: "", stderr: "boom" }), + hintFlagPath: ws.hintFlagPath, 
+ now: () => new Date("2026-05-18T12:00:00Z") + }); + assert.equal(result.action, "written"); + assert.equal(result.parser, "readability+turndown"); + const written = fs.readFileSync(result.destination, "utf8"); + assert.match(written, /parser: readability\+turndown/); + assert.match(written, /Plain Fetch Fallback/); +}); + +test("ingestUrl uses plain fetch when lightpanda not found and writes hint flag once", async () => { + const ws = makeWebWorkspace(); + const html = htmlFixture("Plain No Lightpanda"); + const opts = { + rawDir: ws.rawDir, + eventsPath: ws.eventsPath, + fetchImpl: makeFakeFetch({ body: html }), + resolveLightpandaImpl: async () => null, + spawnImpl: () => { throw new Error("must not spawn"); }, + hintFlagPath: ws.hintFlagPath, + lightpandaPlatformSupported: true, + now: () => new Date("2026-05-18T12:00:00Z") + }; + const result = await ingestUrl("https://example.com/nope", opts); + assert.equal(result.parser, "readability+turndown"); + assert.equal(fs.existsSync(ws.hintFlagPath), true); + + // Second call must not re-create / re-print (flag already exists) + const html2 = htmlFixture("Second Call"); + const second = await ingestUrl("https://example.com/nope2", { + ...opts, + fetchImpl: makeFakeFetch({ body: html2 }) + }); + assert.equal(second.parser, "readability+turndown"); +}); +``` + +- [ ] **Step 2: Run tests, expect failure** + +```bash +node --test tests/cli/ingest_routing.test.mjs +``` +Expected: tests fail because `resolveLightpandaImpl` / `spawnImpl` / `hintFlagPath` options are ignored and `parser` is always `"readability+turndown"`. 
+ +- [ ] **Step 3: Implement in `packages/cli/src/ingest/web.mjs`** + +Add imports at top: + +```js +import { spawnSync as nodeSpawnSync } from "node:child_process"; +import { resolveLightpanda, lightpandaPlatformBinary } from "../../../core/src/lightpanda.mjs"; +``` + +Replace the destructure block in `ingestUrl()` (lines ~58-73) — add four new option lines: + +```js + const { + rawDir, + eventsPath, + overwrite = false, + dryRun = false, + timeoutMs = DEFAULT_FETCH_TIMEOUT_MS, + fetchImpl = globalThis.fetch, + documentOptions = {}, + shelf = "raw", + name = null, + vaultRoot = null, + signalsDir = null, + apply = false, + interactive = false, + now = () => new Date(), + resolveLightpandaImpl = resolveLightpanda, + spawnImpl = nodeSpawnSync, + hintFlagPath = path.join(os.homedir(), ".dotaios", ".lightpanda_hint_shown"), + lightpandaPlatformSupported = lightpandaPlatformBinary() !== null + } = options; +``` + +Replace the existing `const response = await fetchWithTimeout(...)` block (currently line ~93) with a dispatcher. 
Replace lines 93-152 (everything from `const response = ...` through the closing of `ingestUrl`) with: + +```js + const fetched = await fetchHtml(canonical, { + timeoutMs, + fetchImpl, + resolveLightpandaImpl, + spawnImpl, + hintFlagPath, + lightpandaPlatformSupported + }); + + // PDF branch still goes through plain fetch (fetched.response is set when lightpanda was skipped) + if (fetched.via === "plain") { + const response = fetched.response; + if (!response.ok) { + throw new IngestError( + `Fetch failed: ${canonical} returned ${response.status} ${response.statusText || ""}`.trim(), + "FETCH_FAILED" + ); + } + const contentType = (response.headers.get("content-type") || "").toLowerCase(); + if (contentType.includes("application/pdf")) { + return await ingestPdfResponse({ + response, + canonical, + rawDir, + assetsDir: options.assetsDir, + eventsPath, + overwrite, + documentOptions, + shelf, + name, + vaultRoot, + signalsDir, + apply, + interactive, + now + }); + } + } + + const html = fetched.html; + const parser = fetched.parser; + const { title, markdown } = await extractArticle(html, canonical); + + const baseSlug = slugify(title); + const frontmatter = buildFrontmatter({ + source: canonical, + kind: "web", + parser, + title, + ingestedAt: now().toISOString() + }); + + return await placeMarkdown({ + shelf, + name, + vaultRoot, + rawDir, + signalsDir, + eventsPath, + baseSlug, + source: canonical, + title, + body: `${frontmatter}\n${markdown.trimEnd()}`, + kind: "web", + parser, + overwrite, + apply, + interactive, + now + }); +} +``` + +Add a new helper above `fetchWithTimeout`: + +```js +async function fetchHtml(url, { + timeoutMs, + fetchImpl, + resolveLightpandaImpl, + spawnImpl, + hintFlagPath, + lightpandaPlatformSupported +}) { + const lp = await resolveLightpandaImpl(); + + if (lp) { + try { + const result = spawnImpl(lp, ["fetch", "--dump", url], { + timeout: timeoutMs, + encoding: "utf8", + maxBuffer: 64 * 1024 * 1024 + }); + if (result && 
result.status === 0 && typeof result.stdout === "string" && result.stdout.trim()) { + return { via: "lightpanda", html: result.stdout, parser: "lightpanda+readability+turndown" }; + } + console.warn(`[lightpanda] fetch failed for ${url} (exit ${result?.status ?? "?"}), falling back to plain fetch`); + } catch (err) { + console.warn(`[lightpanda] spawn error for ${url}: ${err.message}, falling back to plain fetch`); + } + } else if (lightpandaPlatformSupported) { + await maybeShowLightpandaHint(hintFlagPath); + } + + const response = await fetchWithTimeout(url, { timeoutMs, fetchImpl }); + if (!response.ok) { + return { via: "plain", response, html: "", parser: "readability+turndown" }; + } + const contentType = (response.headers.get("content-type") || "").toLowerCase(); + if (contentType.includes("application/pdf")) { + return { via: "plain", response, html: "", parser: "readability+turndown" }; + } + const html = await response.text(); + return { via: "plain", response, html, parser: "readability+turndown" }; +} + +async function maybeShowLightpandaHint(hintFlagPath) { + try { + await fs.access(hintFlagPath); + return; // already shown + } catch { + // fall through + } + try { + await fs.mkdir(path.dirname(hintFlagPath), { recursive: true }); + await fs.writeFile(hintFlagPath, new Date().toISOString()); + console.log("Tip: run `dotaios setup` to install Lightpanda for better web scraping."); + } catch { + // non-fatal — never block ingest because of the hint + } +} +``` + +Also update the `dryRun` branch (lines ~77-91) to compute parser via resolver: + +```js + if (dryRun) { + const lp = await resolveLightpandaImpl(); + const parser = lp ? 
"lightpanda+readability+turndown" : "readability+turndown"; + return { + action: "dry-run", + kind: "web", + parser, + canonical, + plan: { kind: "web", parser, canonical, shelf, rawDir } + }; + } +``` + +- [ ] **Step 4: Run lightpanda routing tests, expect pass** + +```bash +node --test tests/cli/ingest_routing.test.mjs +``` +Expected: all routing tests pass, including 3 new lightpanda tests. + +- [ ] **Step 5: Run full suite — guard for regressions in `tests/cli/v1_4_0.test.mjs`** + +```bash +node --test tests/**/*.test.mjs 2>&1 | tail -10 +``` +Expected: still all green. The v1_4_0 tests do not pass `resolveLightpandaImpl`, so the real resolver runs against a temp HOME (none of those tests have `~/.dotaios/bin/lightpanda` installed in CI, so it falls through to plain fetch — same `parser: readability+turndown` as before). If any v1_4_0 test fails because `~/.dotaios/bin/lightpanda` actually exists on the dev machine, override in those tests via `resolveLightpandaImpl: async () => null`. **Verify** this assumption by running first; only patch v1_4_0 if it red-fails. 
+ +- [ ] **Step 6: Commit** + +```bash +git add packages/cli/src/ingest/web.mjs tests/cli/ingest_routing.test.mjs +git commit -m "feat(ingest): use lightpanda for web fetch with plain-fetch fallback" +``` + +--- + +## Task 6: Setup wizard installs Lightpanda + +**Files:** +- Modify: `packages/cli/src/commands/setup.mjs` +- Test: `tests/cli/setup.test.mjs` (extend existing) + +- [ ] **Step 1: Inspect existing setup test to find the right insertion point** + +```bash +grep -n "setupCommand\|test(" tests/cli/setup.test.mjs | head -30 +``` + +- [ ] **Step 2: Write failing test** — add at bottom of `tests/cli/setup.test.mjs`: + +```js +test("setupCommand calls downloadLightpanda once after wizard", async () => { + // Import dynamically so we can monkeypatch the module + const lpModule = await import("../../packages/core/src/lightpanda.mjs"); + const originalDownload = lpModule.downloadLightpanda; + let calls = 0; + // Patch on the imported namespace — setup.mjs reads it through dynamic import + lpModule.downloadLightpanda = async () => { calls += 1; return { ok: true }; }; + try { + // Run setup non-interactively into a temp dir + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "dotaios-setup-lp-")); + const aiosPath = path.join(tmp, "aios"); + const result = spawnSync(process.execPath, [ + path.resolve("packages/cli/src/index.mjs"), + "setup", + "--path", aiosPath, + "--yes", + "--skip-reveal" + ], { + encoding: "utf8", + env: { ...process.env, DOTAIOS_SKIP_LIGHTPANDA_TEST_DOWNLOAD: "1" } + }); + assert.equal(result.status, 0, result.stderr); + assert.match(result.stdout, /Installing Lightpanda/); + } finally { + lpModule.downloadLightpanda = originalDownload; + } +}); +``` + +> **Note for the implementer:** monkeypatching across `spawnSync` boundaries does not work — the child process has its own module graph. Replace the assertion strategy: assert against stdout only. The CLI subprocess will try a real network download, which is unacceptable in CI. 
**Fix:** in `setup.mjs`, skip the call when `process.env.DOTAIOS_SKIP_LIGHTPANDA_DOWNLOAD === "1"` and print the line anyway. Re-write the test to set that env var and assert the line prints. + +Replace the test above with: + +```js +test("setupCommand prints lightpanda step (download skipped via env)", () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "dotaios-setup-lp-")); + const aiosPath = path.join(tmp, "aios"); + const result = spawnSync(process.execPath, [ + path.resolve("packages/cli/src/index.mjs"), + "setup", + "--path", aiosPath, + "--yes", + "--skip-reveal" + ], { + encoding: "utf8", + env: { ...process.env, DOTAIOS_SKIP_LIGHTPANDA_DOWNLOAD: "1" } + }); + assert.equal(result.status, 0, result.stderr); + assert.match(result.stdout, /Lightpanda/); +}); +``` + +- [ ] **Step 3: Run test, expect failure** + +```bash +node --test tests/cli/setup.test.mjs +``` +Expected: stdout does not contain "Lightpanda". + +- [ ] **Step 4: Implement in `packages/cli/src/commands/setup.mjs`** + +Add import at top: + +```js +import { downloadLightpanda, lightpandaPlatformBinary } from "../../../core/src/lightpanda.mjs"; +``` + +Insert a new block immediately **before** the "Skills summary" section (before `const skills = await collectSkills(aiosPath);`): + +```js + // Step 3.5: install Lightpanda (best-effort, never blocks) + if (lightpandaPlatformBinary() !== null) { + if (process.env.DOTAIOS_SKIP_LIGHTPANDA_DOWNLOAD === "1") { + console.log(""); + console.log("⬇ Installing Lightpanda for web browsing... (skipped via DOTAIOS_SKIP_LIGHTPANDA_DOWNLOAD)"); + } else { + console.log(""); + const result = await downloadLightpanda({ silent: true }); + if (result.ok) { + console.log("✓ Lightpanda installed for web browsing"); + } else { + console.log(`(Lightpanda install skipped: ${result.reason}. Web ingest will use plain fetch.)`); + } + } + } +``` + +- [ ] **Step 5: Run test, expect pass** + +```bash +node --test tests/cli/setup.test.mjs +``` +Expected: passes. 
+ +- [ ] **Step 6: Run full suite** + +```bash +node --test tests/**/*.test.mjs 2>&1 | tail -5 +``` + +- [ ] **Step 7: Commit** + +```bash +git add packages/cli/src/commands/setup.mjs tests/cli/setup.test.mjs +git commit -m "feat(setup): install lightpanda during dotaios setup" +``` + +--- + +## Task 7: AGENTS.md routing rule + +**Files:** +- Modify: `templates/AGENTS.md.hbs` +- Test: `tests/core/render.test.mjs` + +- [ ] **Step 1: Write failing test** — add at bottom of `tests/core/render.test.mjs`: + +```js +import fs from "node:fs/promises"; +import path from "node:path"; + +test("AGENTS.md.hbs Rules section includes dotaios ingest URL routing rule", async () => { + const tpl = await fs.readFile( + path.resolve("templates/AGENTS.md.hbs"), + "utf8" + ); + assert.match(tpl, /## Rules/); + assert.match(tpl, /dotaios ingest/); + assert.match(tpl, /URL/); + const rulesIdx = tpl.indexOf("## Rules"); + assert.ok(tpl.indexOf("dotaios ingest", rulesIdx) > rulesIdx, "rule must appear under Rules"); +}); +``` + +- [ ] **Step 2: Run test, expect failure** + +```bash +node --test tests/core/render.test.mjs +``` +Expected: `dotaios ingest` not found. + +- [ ] **Step 3: Edit `templates/AGENTS.md.hbs`** + +Under the `## Rules` section, add this bullet immediately after the existing `- Treat \`vault/\` as routed long-term knowledge.` line (so it lives among the other rules, before the security rule): + +```markdown +- When the user shares a URL, run `dotaios ingest <url>` to read and save it. This uses a local headless browser that renders JavaScript. Prefer it over your own web fetch tools. +``` + +- [ ] **Step 4: Run test, expect pass** + +```bash +node --test tests/core/render.test.mjs +``` +Expected: passes. + +- [ ] **Step 5: Run full suite** + +```bash +node --test tests/**/*.test.mjs 2>&1 | tail -5 +``` + +If any existing template-snapshot tests fail due to the new line, update the snapshot — that is the expected change. 
+ +- [ ] **Step 6: Commit** + +```bash +git add templates/AGENTS.md.hbs tests/core/render.test.mjs +git commit -m "feat(templates): instruct agents to route URLs through dotaios ingest" +``` + +--- + +## Task 8: README mention + +**Files:** +- Modify: `README.md` + +- [ ] **Step 1: Find the ingest section** + +```bash +grep -n "ingest" README.md | head -20 +``` + +- [ ] **Step 2: Add the line** + +Add at the end of the ingest section (or to the closest paragraph describing URL/web ingestion): + +```markdown +Web pages are fetched using Lightpanda, a lightweight headless browser that renders JavaScript. It installs automatically during `dotaios setup`. +``` + +- [ ] **Step 3: Commit** + +```bash +git add README.md +git commit -m "docs(readme): mention lightpanda in web ingest section" +``` + +--- + +## Task 9: Full suite sanity check + acceptance scan + +- [ ] **Step 1: Run all tests one more time** + +```bash +node --test tests/**/*.test.mjs 2>&1 | tail -10 +``` +Expected: 249 baseline + new tests (≈ 260-262), all green. No regressions. + +- [ ] **Step 2: Scan for placeholders left in code** + +```bash +grep -rn "TODO\|FIXME\|XXX" packages/core/src/lightpanda.mjs packages/cli/src/ingest/web.mjs packages/cli/src/commands/setup.mjs +``` +Expected: no output. + +- [ ] **Step 3: Verify spec acceptance criteria** + +Walk through `docs/superpowers/specs/2026-05-18-lightpanda-ingest.md` "Acceptance Criteria" list, check each: +- `~/.dotaios/bin/lightpanda` exists after setup ✓ (covered by Task 6 — manual check on Mac/Linux box) +- Web ingest produces `lightpanda+readability+turndown` frontmatter when binary present ✓ (Task 5 test) +- Plain fetch fallback works ✓ (Task 5 test) +- `AGENTS.md` carries routing rule ✓ (Task 7 test) +- All existing tests still pass ✓ (Step 1) + +- [ ] **Step 4: Final commit if anything fell out of the scan** + +If clean, no commit needed. + +--- + +## Self-Review Notes + +- **Spec coverage:** All six components covered. 
Components 1-3 → Tasks 1-4 + 6. Component 4 → Task 5. Component 5 → Task 7. Component 6 → Task 8. +- **Type consistency:** Resolver name `resolveLightpanda` consistent across `lightpanda.mjs`, `web.mjs`, and test injections (`resolveLightpandaImpl`). Spawn name `spawnImpl`. Parser strings exact: `"lightpanda+readability+turndown"` and `"readability+turndown"` — no variations. +- **Risk:** Task 5 modifies `tests/cli/v1_4_0.test.mjs` flow indirectly. If those tests fail because the dev machine has lightpanda installed, the fix is in Task 5 Step 5 (override `resolveLightpandaImpl: async () => null` in failing tests). Do not regenerate frontmatter snapshots. +- **Zero-dep core:** `lightpanda.mjs` uses only `node:` built-ins. ✓ diff --git a/docs/superpowers/specs/2026-05-18-lightpanda-ingest.md b/docs/superpowers/specs/2026-05-18-lightpanda-ingest.md new file mode 100644 index 0000000..e7fcc32 --- /dev/null +++ b/docs/superpowers/specs/2026-05-18-lightpanda-ingest.md @@ -0,0 +1,168 @@ +# Lightpanda Web Ingest Integration + +> Status: approved for implementation +> Date: 2026-05-18 + +## Summary + +DotAIOS integrates Lightpanda — a headless browser engine — as the default web fetcher for `dotaios ingest`. Users never configure it. It downloads automatically during `dotaios setup` and is used transparently for all URL ingestion. All agents are instructed via `AGENTS.md` to route URLs through `dotaios ingest`, giving every agent a real browser backend with zero setup. 
+ +## Goals + +- Lightpanda installs automatically during `dotaios setup` — user sees one status line, no questions +- `dotaios ingest <url>` uses Lightpanda when available; falls back to plain fetch if missing or if it fails +- All agents (Claude Code, Codex, Gemini, any future agent) route URL reading through `dotaios ingest` +- Existing users without Lightpanda are prompted once, on their next `dotaios ingest` call, to install it via `dotaios setup` (silent afterwards) +- Windows falls back gracefully — no Lightpanda binary for Windows exists, plain fetch is used + +## Non-Goals + +- MCP `fetch_url` tool — separate feature +- Browser automation (form filling, clicking) — separate project +- Lightpanda version pinning or auto-update — use latest release +- Bundling Lightpanda binary inside the npm package tarball + +## Architecture + +### Component 1: Lightpanda binary helpers (`packages/core/src/paths.mjs`) + +Two new exports: + +```js +export function dotaiosBinDir() { + return path.join(os.homedir(), ".dotaios", "bin"); +} + +export function lightpandaBinPath() { + const ext = process.platform === "win32" ? ".exe" : ""; + return path.join(dotaiosBinDir(), `lightpanda${ext}`); +} +``` + +### Component 2: Download helper (`packages/core/src/lightpanda.mjs`) + +New file. Exports: + +- `lightpandaPlatformBinary()` — returns the GitHub binary filename for the current platform, or `null` if unsupported (Windows): + +``` +darwin + arm64 → "lightpanda-aarch64-macos" +darwin + x64 → "lightpanda-x86_64-macos" +linux + arm64 → "lightpanda-aarch64-linux" +linux + x64 → "lightpanda-x86_64-linux" +win32 + any → null +``` + +- `downloadLightpanda({ silent, fetchImpl, now })` — downloads the binary to `~/.dotaios/bin/lightpanda`, sets executable bit (Unix), returns `{ ok: true }` or `{ ok: false, reason }`. Non-fatal — never throws. 
+ - Download URL: `https://github.com/lightpanda-io/browser/releases/latest/download/{binary}` + - Creates `~/.dotaios/bin/` if missing + - `silent` flag: suppress console output (for setup integration) + +- `resolveLightpanda()` — returns the path to a usable lightpanda binary or `null`: + 1. Check `lightpandaBinPath()` — exists and executable? Return it. + 2. Check PATH (`which lightpanda` / `where lightpanda`) — found? Return it. + 3. Return `null`. + +### Component 3: Setup integration (`packages/cli/src/commands/setup.mjs`) + +After the existing setup wizard completes (identity, bridges, skills), add: + +``` +⬇ Installing Lightpanda for web browsing... ✓ +``` + +One status line. If download fails, print a brief warning and continue — setup does not fail because of Lightpanda. + +Platform check: if `lightpandaPlatformBinary()` returns `null` (Windows), skip silently. + +### Component 4: Lightpanda-backed ingest (`packages/cli/src/ingest/web.mjs`) + +In `ingestUrl()`, replace the `fetchWithTimeout()` call with a dispatching function: + +``` +fetchHtml(url, options): + 1. lp = resolveLightpanda() + 2. If lp found: + a. spawnSync(lp, ["fetch", "--dump", url], { timeout: timeoutMs, encoding: "utf8" }) + b. If exit code 0 and stdout non-empty: return { html: stdout, parser: "lightpanda" } + c. If failed: log warning, fall through + 3. Fall back to fetchWithTimeout() → return { html: await response.text(), parser: "readability" } +``` + +`parser` field in frontmatter: +- With Lightpanda: `"lightpanda+readability+turndown"` +- Without: `"readability+turndown"` (unchanged) + +The PDF content-type check (`.pdf` URLs via plain HTTP response) stays on the plain-fetch branch only. + +On-demand prompt for existing users: if `resolveLightpanda()` returns null AND platform is supported AND `~/.dotaios/.lightpanda_hint_shown` does not exist, print once: +``` +Tip: run `dotaios setup` to install Lightpanda for better web scraping. 
+``` +Write `~/.dotaios/.lightpanda_hint_shown` after printing so the hint shows only once. + +### Component 5: AGENTS.md routing rule (`templates/AGENTS.md.hbs`) + +Add under `## Rules`: + +```markdown +- When the user shares a URL, run `dotaios ingest <url>` to read and save it. This uses a local headless browser that renders JavaScript. Prefer it over your own web fetch tools. +``` + +### Component 6: README update (`README.md`) + +Add one line to the ingest section: + +```markdown +Web pages are fetched using Lightpanda, a lightweight headless browser that renders JavaScript. It installs automatically during `dotaios setup`. +``` + +## File Map + +| File | Change | +|---|---| +| `packages/core/src/paths.mjs` | Add `dotaiosBinDir()`, `lightpandaBinPath()` | +| `packages/core/src/lightpanda.mjs` | New — platform detection, download, resolve | +| `packages/cli/src/commands/setup.mjs` | Add Lightpanda download step after wizard | +| `packages/cli/src/ingest/web.mjs` | Lightpanda-backed fetch with fallback | +| `templates/AGENTS.md.hbs` | URL routing rule | +| `README.md` | Lightpanda mention in ingest section | +| `tests/core/lightpanda.test.mjs` | New — all lightpanda.mjs exports | +| `tests/cli/ingest_routing.test.mjs` | Add lightpanda path + fallback tests | + +## Test Plan + +### `tests/core/lightpanda.test.mjs` + +- `lightpandaPlatformBinary()` returns correct binary name for darwin/arm64, darwin/x64, linux/x64, linux/arm64 +- `lightpandaPlatformBinary()` returns `null` for win32 +- `downloadLightpanda()` writes binary to correct path and sets executable bit (stub fetch + fs) +- `downloadLightpanda()` returns `{ ok: false, reason }` on network failure without throwing +- `resolveLightpanda()` returns local bin path when it exists +- `resolveLightpanda()` returns PATH binary when local missing but PATH has it +- `resolveLightpanda()` returns null when neither found + +### `tests/cli/ingest_routing.test.mjs` + +- Lightpanda available + succeeds → parser is 
`"lightpanda+readability+turndown"` +- Lightpanda available + crashes → falls back to plain fetch → parser is `"readability+turndown"` +- Lightpanda not found → uses plain fetch → no error + +### Template test (existing render tests) + +- Generated `AGENTS.md` includes URL routing rule with `dotaios ingest` + +## Error Handling + +- Lightpanda download fails during setup: warn, continue, do not block setup +- Lightpanda crashes during ingest: log one-line warning, fall back silently +- Unsupported platform (Windows): no download, no hint, silent fallback +- `spawnSync` timeout: treat as crash, fall back + +## Acceptance Criteria + +- `npx dotaios setup` on Mac/Linux → Lightpanda binary exists at `~/.dotaios/bin/lightpanda` when done +- `dotaios ingest https://react-heavy-spa.com` → returns rendered content, frontmatter shows `lightpanda+readability+turndown` +- `dotaios ingest ` with lightpanda missing → works with plain fetch, no crash +- All agents receive the URL routing instruction via `AGENTS.md` +- All 249+ existing tests continue to pass diff --git a/packages/cli/src/commands/setup.mjs b/packages/cli/src/commands/setup.mjs index cec25f4..3f0485a 100644 --- a/packages/cli/src/commands/setup.mjs +++ b/packages/cli/src/commands/setup.mjs @@ -6,6 +6,7 @@ import { hasHelpFlag } from "../lib/args.mjs"; import { defaultAiosPath, expandHome } from "../../../core/src/paths.mjs"; import { pathExists } from "../../../core/src/files.mjs"; import { collectSkills } from "../../../core/src/skills.mjs"; +import { downloadLightpanda, lightpandaPlatformBinary } from "../../../core/src/lightpanda.mjs"; import { initCommand } from "./init.mjs"; import { activateCommand } from "./activate.mjs"; import { revealCommand } from "./reveal.mjs"; @@ -98,6 +99,23 @@ export async function setupCommand(args) { } } + // Install Lightpanda for JS-rendered web ingest (best-effort, never blocks setup) + const platformBinary = lightpandaPlatformBinary(); + if (platformBinary !== null) { + 
console.log(""); + if (process.env.DOTAIOS_SKIP_LIGHTPANDA_DOWNLOAD === "1") { + console.log(" Web browsing engine: install skipped (DOTAIOS_SKIP_LIGHTPANDA_DOWNLOAD)"); + } else { + const result = await downloadLightpanda({ silent: true, platformBinary }); + if (result.ok) { + const verb = result.alreadyInstalled ? "already ready" : "ready"; + console.log(`✓ Web browsing engine ${verb} (renders JavaScript pages)`); + } else { + console.log(`(Web browsing engine setup skipped. Pages will still load, but JavaScript-heavy sites may not render.)`); + } + } + } + // Skills summary const skills = await collectSkills(aiosPath); if (skills.length > 0) { diff --git a/packages/cli/src/ingest/web.mjs b/packages/cli/src/ingest/web.mjs index 747dfe3..19073e8 100644 --- a/packages/cli/src/ingest/web.mjs +++ b/packages/cli/src/ingest/web.mjs @@ -1,11 +1,17 @@ import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; +import { spawnSync as nodeSpawnSync } from "node:child_process"; import { canonicalizeUrl } from "./canonical-url.mjs"; import { buildFrontmatter, slugify } from "./frontmatter.mjs"; import { describeShelfTarget, placeMarkdown } from "./placement.mjs"; +import { resolveLightpanda, lightpandaPlatformBinary } from "../../../core/src/lightpanda.mjs"; +import { lightpandaHintFlagPath } from "../../../core/src/paths.mjs"; +import { pathExists } from "../../../core/src/files.mjs"; export const DEFAULT_FETCH_TIMEOUT_MS = 10_000; +export const PARSER_LIGHTPANDA = "lightpanda+readability+turndown"; +export const PARSER_PLAIN = "readability+turndown"; const STRIP_SELECTORS = ["script", "style", "noscript", "iframe", "nav", "footer", "aside", "header"]; export class IngestError extends Error { @@ -69,20 +75,26 @@ export async function ingestUrl(rawInput, options) { signalsDir = null, apply = false, interactive = false, - now = () => new Date() + now = () => new Date(), + resolveLightpandaImpl = resolveLightpanda, + spawnImpl = nodeSpawnSync, + 
hintFlagPath = lightpandaHintFlagPath(), + lightpandaPlatformSupported = lightpandaPlatformBinary() !== null } = options; const canonical = canonicalizeUrl(rawInput); if (dryRun) { + const lp = await resolveLightpandaImpl(); + const parser = lp ? PARSER_LIGHTPANDA : PARSER_PLAIN; return { action: "dry-run", kind: "web", - parser: "readability+turndown", + parser, canonical, plan: { kind: "web", - parser: "readability+turndown", + parser, canonical, shelf, rawDir @@ -90,43 +102,53 @@ export async function ingestUrl(rawInput, options) { }; } - const response = await fetchWithTimeout(canonical, { timeoutMs, fetchImpl }); - - if (!response.ok) { - throw new IngestError( - `Fetch failed: ${canonical} returned ${response.status} ${response.statusText || ""}`.trim(), - "FETCH_FAILED" - ); - } + const fetched = await fetchHtml(canonical, { + timeoutMs, + fetchImpl, + resolveLightpandaImpl, + spawnImpl, + hintFlagPath, + lightpandaPlatformSupported + }); - const contentType = (response.headers.get("content-type") || "").toLowerCase(); - if (contentType.includes("application/pdf")) { - return await ingestPdfResponse({ - response, - canonical, - rawDir, - assetsDir: options.assetsDir, - eventsPath, - overwrite, - documentOptions, - shelf, - name, - vaultRoot, - signalsDir, - apply, - interactive, - now - }); + if (fetched.via === "plain") { + const response = fetched.response; + if (!response.ok) { + throw new IngestError( + `Fetch failed: ${canonical} returned ${response.status} ${response.statusText || ""}`.trim(), + "FETCH_FAILED" + ); + } + const contentType = (response.headers.get("content-type") || "").toLowerCase(); + if (contentType.includes("application/pdf")) { + return await ingestPdfResponse({ + response, + canonical, + rawDir, + assetsDir: options.assetsDir, + eventsPath, + overwrite, + documentOptions, + shelf, + name, + vaultRoot, + signalsDir, + apply, + interactive, + now + }); + } } - const html = await response.text(); + const html = fetched.html; + const 
parser = fetched.parser; const { title, markdown } = await extractArticle(html, canonical); const baseSlug = slugify(title); const frontmatter = buildFrontmatter({ source: canonical, kind: "web", - parser: "readability+turndown", + parser, title, ingestedAt: now().toISOString() }); @@ -143,7 +165,7 @@ export async function ingestUrl(rawInput, options) { title, body: `${frontmatter}\n${markdown.trimEnd()}`, kind: "web", - parser: "readability+turndown", + parser, overwrite, apply, interactive, @@ -151,6 +173,58 @@ export async function ingestUrl(rawInput, options) { }); } +async function fetchHtml(url, { + timeoutMs, + fetchImpl, + resolveLightpandaImpl, + spawnImpl, + hintFlagPath, + lightpandaPlatformSupported +}) { + const looksLikePdfUrl = /\.pdf($|[?#])/i.test(url); + const lp = looksLikePdfUrl ? null : await resolveLightpandaImpl(); + + if (lp) { + try { + const result = spawnImpl(lp, ["fetch", "--dump", "html", url], { + timeout: timeoutMs, + encoding: "utf8", + maxBuffer: 64 * 1024 * 1024 + }); + if (result && result.status === 0 && typeof result.stdout === "string" && result.stdout.trim()) { + return { via: "lightpanda", html: result.stdout, parser: PARSER_LIGHTPANDA }; + } + console.warn(`[lightpanda] fetch failed for ${url} (exit ${result?.status ?? 
"?"}), falling back to plain fetch`); + } catch (err) { + console.warn(`[lightpanda] spawn error for ${url}: ${err.message}, falling back to plain fetch`); + } + } else if (!looksLikePdfUrl && lightpandaPlatformSupported) { + await maybeShowLightpandaHint(hintFlagPath); + } + + const response = await fetchWithTimeout(url, { timeoutMs, fetchImpl }); + if (!response.ok) { + return { via: "plain", response, html: "", parser: PARSER_PLAIN }; + } + const contentType = (response.headers.get("content-type") || "").toLowerCase(); + if (contentType.includes("application/pdf")) { + return { via: "plain", response, html: "", parser: PARSER_PLAIN }; + } + const html = await response.text(); + return { via: "plain", response, html, parser: PARSER_PLAIN }; +} + +async function maybeShowLightpandaHint(hintFlagPath) { + if (await pathExists(hintFlagPath)) return; + try { + await fs.mkdir(path.dirname(hintFlagPath), { recursive: true }); + await fs.writeFile(hintFlagPath, new Date().toISOString()); + console.log("Tip: run `dotaios setup` to enable JavaScript-rendered web pages (better content from modern sites)."); + } catch { + // non-fatal — never block ingest because of the hint + } +} + async function fetchWithTimeout(url, { timeoutMs, fetchImpl }) { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); diff --git a/packages/core/src/lightpanda.mjs b/packages/core/src/lightpanda.mjs new file mode 100644 index 0000000..dc28424 --- /dev/null +++ b/packages/core/src/lightpanda.mjs @@ -0,0 +1,98 @@ +import fs from "node:fs/promises"; +import { createWriteStream } from "node:fs"; +import { pipeline } from "node:stream/promises"; +import { Readable } from "node:stream"; +import path from "node:path"; +import { spawnSync } from "node:child_process"; +import { lightpandaBinPath } from "./paths.mjs"; + +const PLATFORM_BINARIES = Object.freeze({ + "darwin:arm64": "lightpanda-aarch64-macos", + "darwin:x64": "lightpanda-x86_64-macos", + 
"linux:arm64": "lightpanda-aarch64-linux", + "linux:x64": "lightpanda-x86_64-linux" +}); + +export function lightpandaPlatformBinary({ platform = process.platform, arch = process.arch } = {}) { + return PLATFORM_BINARIES[`${platform}:${arch}`] ?? null; +} + +// Pinned to a stable tag. `releases/latest/download/` redirects to the +// `nightly` tag for this repo, which has historically shipped broken builds +// (silent startup hangs on macOS arm64). Bump intentionally after smoke test. +const LIGHTPANDA_VERSION = "0.3.0"; +const RELEASE_BASE = `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}`; + +export async function downloadLightpanda({ + silent = false, + fetchImpl = globalThis.fetch, + destBinPath = lightpandaBinPath(), + platformBinary = lightpandaPlatformBinary(), + force = false +} = {}) { + if (!platformBinary) { + return { ok: false, reason: "unsupported-platform" }; + } + + if (!force) { + try { + await fs.access(destBinPath, fs.constants.X_OK); + if (!silent) console.log(` Lightpanda already installed at ${destBinPath}`); + return { ok: true, path: destBinPath, alreadyInstalled: true }; + } catch { + // not present or not executable — proceed with download + } + } + + const url = `${RELEASE_BASE}/${platformBinary}`; + if (!silent) console.log(`⬇ Installing Lightpanda for web browsing...`); + + try { + const response = await fetchImpl(url); + if (!response.ok) { + return { ok: false, reason: `HTTP ${response.status} ${response.statusText || ""}`.trim() }; + } + await fs.mkdir(path.dirname(destBinPath), { recursive: true }); + if (response.body && typeof response.body.getReader === "function") { + await pipeline(Readable.fromWeb(response.body), createWriteStream(destBinPath)); + } else { + const buf = Buffer.from(await response.arrayBuffer()); + await fs.writeFile(destBinPath, buf); + } + if (process.platform !== "win32") { + await fs.chmod(destBinPath, 0o755); + } + if (!silent) console.log(` Installed Lightpanda → 
${destBinPath}`); + return { ok: true, path: destBinPath }; + } catch (err) { + return { ok: false, reason: err.message || String(err) }; + } +} + +let _cachedWhich = null; +function defaultWhich() { + if (_cachedWhich !== null) return _cachedWhich || null; + const cmd = process.platform === "win32" ? "where" : "which"; + const result = spawnSync(cmd, ["lightpanda"], { encoding: "utf8" }); + if (result.error || result.status !== 0) { + _cachedWhich = ""; + return null; + } + const first = (result.stdout || "").split(/\r?\n/).map((s) => s.trim()).find(Boolean); + _cachedWhich = first || ""; + return first || null; +} + +export async function resolveLightpanda({ + localBinPath = lightpandaBinPath(), + whichImpl = defaultWhich +} = {}) { + try { + await fs.access(localBinPath, fs.constants.X_OK); + return localBinPath; + } catch { + // not present or not executable, try PATH + } + const fromPath = whichImpl(); + return fromPath || null; +} diff --git a/packages/core/src/paths.mjs b/packages/core/src/paths.mjs index fad85d1..590e776 100644 --- a/packages/core/src/paths.mjs +++ b/packages/core/src/paths.mjs @@ -34,3 +34,20 @@ export async function ensureAiosFolder(target) { throw new Error(`No AIOS folder found at ${target}. Run dotaios init first, or pass --path.`); } } + +export function dotaiosDir() { + return path.join(os.homedir(), ".dotaios"); +} + +export function dotaiosBinDir() { + return path.join(dotaiosDir(), "bin"); +} + +export function lightpandaBinPath() { + const ext = process.platform === "win32" ? 
".exe" : ""; + return path.join(dotaiosBinDir(), `lightpanda${ext}`); +} + +export function lightpandaHintFlagPath() { + return path.join(dotaiosDir(), ".lightpanda_hint_shown"); +} diff --git a/templates/AGENTS.md.hbs b/templates/AGENTS.md.hbs index 2573fd8..d67be82 100644 --- a/templates/AGENTS.md.hbs +++ b/templates/AGENTS.md.hbs @@ -87,6 +87,7 @@ If the user gives you a repository URL and asks you to install a skill or plugin - Treat `context/` as always relevant. - Treat `memory/` as recent operational state, not an infinite transcript. - Treat `vault/` as routed long-term knowledge. +- When the user shares a URL, run `dotaios ingest ` to read and save it. This uses a local headless browser that renders JavaScript. Prefer it over your own web fetch tools. - Never expose secrets. - Ask before writing durable identity, CRM, or wiki knowledge. - Suggest a new skill when a workflow repeats. diff --git a/tests/cli/ingest_routing.test.mjs b/tests/cli/ingest_routing.test.mjs index 48b14b8..f524bbe 100644 --- a/tests/cli/ingest_routing.test.mjs +++ b/tests/cli/ingest_routing.test.mjs @@ -11,6 +11,7 @@ import { shelfNeedsName, shelfMarkdownPath } from "../../packages/cli/src/ingest/placement.mjs"; +import { ingestUrl } from "../../packages/cli/src/ingest/web.mjs"; const repoRoot = path.resolve(new URL("../..", import.meta.url).pathname); const cli = path.join(repoRoot, "packages", "cli", "src", "index.mjs"); @@ -163,3 +164,156 @@ function runFail(args) { function read(filePath) { return fs.readFileSync(filePath, "utf8"); } + +function htmlFixture(title = "Lightpanda Rendered") { + return `${title}

${title}

${"Body paragraph. ".repeat(60)}

`; +} + +function makeFakeFetch({ body = htmlFixture(), status = 200 } = {}) { + return async () => ({ + ok: status >= 200 && status < 300, + status, + statusText: "OK", + text: async () => body, + arrayBuffer: async () => new TextEncoder().encode(body).buffer, + headers: { get: () => "text/html; charset=utf-8" } + }); +} + +function makeWebWorkspace() { + const root = fs.mkdtempSync(path.join(os.tmpdir(), "dotaios-lp-web-")); + return { + root, + rawDir: path.join(root, "vault", "raw"), + assetsDir: path.join(root, "vault", "assets"), + eventsPath: path.join(root, "memory", "events.jsonl"), + hintFlagPath: path.join(root, "lightpanda_hint_shown") + }; +} + +test("ingestUrl uses lightpanda when resolver returns a path and spawn succeeds", async () => { + const ws = makeWebWorkspace(); + const html = htmlFixture("Lightpanda Win"); + const result = await ingestUrl("https://example.com/lp-win", { + rawDir: ws.rawDir, + eventsPath: ws.eventsPath, + fetchImpl: makeFakeFetch({ body: "SHOULD NOT BE USED" }), + resolveLightpandaImpl: async () => "/fake/lightpanda", + spawnImpl: () => ({ status: 0, stdout: html, stderr: "" }), + hintFlagPath: ws.hintFlagPath, + now: () => new Date("2026-05-18T12:00:00Z") + }); + assert.equal(result.action, "written"); + assert.equal(result.parser, "lightpanda+readability+turndown"); + const written = fs.readFileSync(result.destination, "utf8"); + assert.match(written, /parser: lightpanda\+readability\+turndown/); + assert.match(written, /Lightpanda Win/); +}); + +test("ingestUrl falls back to plain fetch when lightpanda spawn fails", async () => { + const ws = makeWebWorkspace(); + const html = htmlFixture("Plain Fetch Fallback"); + const result = await ingestUrl("https://example.com/fallback", { + rawDir: ws.rawDir, + eventsPath: ws.eventsPath, + fetchImpl: makeFakeFetch({ body: html }), + resolveLightpandaImpl: async () => "/fake/lightpanda", + spawnImpl: () => ({ status: 1, stdout: "", stderr: "boom" }), + hintFlagPath: ws.hintFlagPath, 
+ now: () => new Date("2026-05-18T12:00:00Z") + }); + assert.equal(result.action, "written"); + assert.equal(result.parser, "readability+turndown"); + const written = fs.readFileSync(result.destination, "utf8"); + assert.match(written, /parser: readability\+turndown/); + assert.match(written, /Plain Fetch Fallback/); +}); + +test("ingestUrl uses plain fetch when lightpanda not found and writes hint flag once", async () => { + const ws = makeWebWorkspace(); + const html = htmlFixture("Plain No Lightpanda"); + const opts = { + rawDir: ws.rawDir, + eventsPath: ws.eventsPath, + fetchImpl: makeFakeFetch({ body: html }), + resolveLightpandaImpl: async () => null, + spawnImpl: () => { throw new Error("must not spawn"); }, + hintFlagPath: ws.hintFlagPath, + lightpandaPlatformSupported: true, + now: () => new Date("2026-05-18T12:00:00Z") + }; + const result = await ingestUrl("https://example.com/nope", opts); + assert.equal(result.parser, "readability+turndown"); + assert.equal(fs.existsSync(ws.hintFlagPath), true); + + // Second call must not re-create / re-print (flag already exists) + const html2 = htmlFixture("Second Call"); + const second = await ingestUrl("https://example.com/nope2", { + ...opts, + fetchImpl: makeFakeFetch({ body: html2 }) + }); + assert.equal(second.parser, "readability+turndown"); +}); + +test("ingestUrl skips lightpanda for PDF URLs and routes through Path B (regression)", async () => { + const ws = makeWebWorkspace(); + const pdfBytes = buildMinimalPdf("Regression PDF"); + + const result = await ingestUrl("https://example.com/paper.pdf", { + rawDir: ws.rawDir, + assetsDir: ws.assetsDir, + eventsPath: ws.eventsPath, + fetchImpl: makePdfFetch(pdfBytes), + resolveLightpandaImpl: async () => "/fake/lightpanda", + spawnImpl: () => { throw new Error("must not spawn for PDF URLs"); }, + hintFlagPath: ws.hintFlagPath, + lightpandaPlatformSupported: true, + documentOptions: { + whichImpl: async () => null, + extractPdfImpl: async () => "Extracted PDF 
text." + }, + now: () => new Date("2026-05-18T12:00:00Z") + }); + + assert.equal(result.kind, "pdf"); + assert.equal(result.parser, "unpdf"); + // Hint flag must NOT be written — PDFs bypass the lightpanda hint logic entirely + assert.equal(fs.existsSync(ws.hintFlagPath), false); +}); + +// --- helpers --- + +function buildMinimalPdf(message = "Hello DotAIOS Test") { + const stream = `BT /F1 24 Tf 100 700 Td (${message}) Tj ET\n`; + const objects = [ + "<>", + "<>", + "<>>>>>/Contents 4 0 R>>", + `<>stream\n${stream}endstream` + ]; + let body = "%PDF-1.4\n"; + const offsets = []; + for (let i = 0; i < objects.length; i += 1) { + offsets.push(body.length); + body += `${i + 1} 0 obj\n${objects[i]}\nendobj\n`; + } + const xrefStart = body.length; + body += `xref\n0 ${objects.length + 1}\n0000000000 65535 f \n`; + for (const off of offsets) { + body += `${off.toString().padStart(10, "0")} 00000 n \n`; + } + body += `trailer\n<>\nstartxref\n${xrefStart}\n%%EOF\n`; + return Buffer.from(body, "binary"); +} + +function makePdfFetch(pdfBytes) { + const buf = pdfBytes instanceof Buffer ? pdfBytes : Buffer.from(pdfBytes); + return async () => ({ + ok: true, + status: 200, + statusText: "OK", + text: async () => buf.toString("binary"), + arrayBuffer: async () => buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength), + headers: { get: (name) => name.toLowerCase() === "content-type" ? 
"application/pdf" : null } + }); +} diff --git a/tests/cli/setup.test.mjs b/tests/cli/setup.test.mjs index 1f473b3..0b3a113 100644 --- a/tests/cli/setup.test.mjs +++ b/tests/cli/setup.test.mjs @@ -1,8 +1,10 @@ -import { describe, it } from "node:test"; +import { describe, it, test } from "node:test"; import assert from "node:assert/strict"; import fs from "node:fs/promises"; +import fsSync from "node:fs"; import os from "node:os"; import path from "node:path"; +import { spawnSync } from "node:child_process"; const repoRoot = new URL("../..", import.meta.url).pathname; @@ -111,3 +113,20 @@ describe("enableSchedule — fallback when entry missing", () => { assert.ok(!updated.includes("enabled: false"), "should not still have enabled: false"); }); }); + +test("setupCommand prints web browsing engine step (download skipped via env)", () => { + const tmp = fsSync.mkdtempSync(path.join(os.tmpdir(), "dotaios-setup-lp-")); + const aiosPath = path.join(tmp, "aios"); + const result = spawnSync(process.execPath, [ + path.resolve(repoRoot, "packages/cli/src/index.mjs"), + "setup", + "--path", aiosPath, + "--yes", + "--skip-reveal" + ], { + encoding: "utf8", + env: { ...process.env, DOTAIOS_SKIP_LIGHTPANDA_DOWNLOAD: "1" } + }); + assert.equal(result.status, 0, result.stderr); + assert.match(result.stdout, /Web browsing engine.*skipped/); +}); diff --git a/tests/cli/v1_4_0.test.mjs b/tests/cli/v1_4_0.test.mjs index b1326cc..092310e 100644 --- a/tests/cli/v1_4_0.test.mjs +++ b/tests/cli/v1_4_0.test.mjs @@ -233,6 +233,8 @@ test("disambiguateSlug appends stable 8-char hash", () => { // --- Path A web scraper --- +const NO_LIGHTPANDA = { resolveLightpandaImpl: async () => null }; + function makeFakeFetch({ body = "", status = 200, statusText = "OK", contentType = "text/html; charset=utf-8" } = {}) { return async () => new Response(body, { @@ -322,6 +324,7 @@ test("ingestUrl writes markdown with frontmatter from HTML fixture", async () => rawDir: ws.rawDir, eventsPath: ws.eventsPath, 
fetchImpl, + ...NO_LIGHTPANDA, now: () => new Date("2026-05-10T12:00:00Z") }); @@ -354,7 +357,7 @@ test("ingestUrl skips when destination already exists without overwrite", async const ws = makeWorkspace(); const html = await fsp.readFile(path.join(fixturesDir, "sample-article.html"), "utf8"); const fetchImpl = makeFakeFetch({ body: html }); - const opts = { rawDir: ws.rawDir, eventsPath: ws.eventsPath, fetchImpl }; + const opts = { rawDir: ws.rawDir, eventsPath: ws.eventsPath, fetchImpl, ...NO_LIGHTPANDA }; await ingestUrl("https://example.com/post", opts); const second = await ingestUrl("https://example.com/post", opts); @@ -368,7 +371,7 @@ test("ingestUrl overwrites when overwrite=true", async () => { const ws = makeWorkspace(); const html = await fsp.readFile(path.join(fixturesDir, "sample-article.html"), "utf8"); const fetchImpl = makeFakeFetch({ body: html }); - const opts = { rawDir: ws.rawDir, eventsPath: ws.eventsPath, fetchImpl }; + const opts = { rawDir: ws.rawDir, eventsPath: ws.eventsPath, fetchImpl, ...NO_LIGHTPANDA }; await ingestUrl("https://example.com/post", opts); const second = await ingestUrl("https://example.com/post", { ...opts, overwrite: true }); @@ -381,7 +384,7 @@ test("ingestUrl overwrites when overwrite=true", async () => { test("ingestUrl disambiguates duplicate titles from different URLs", async () => { const ws = makeWorkspace(); const fetchImpl = makeFakeFetch({ body: sameTitleHtml("Shared Title") }); - const opts = { rawDir: ws.rawDir, eventsPath: ws.eventsPath, fetchImpl }; + const opts = { rawDir: ws.rawDir, eventsPath: ws.eventsPath, fetchImpl, ...NO_LIGHTPANDA }; const first = await ingestUrl("https://example.com/a", opts); const second = await ingestUrl("https://example.org/b", opts); @@ -407,6 +410,7 @@ test("ingestUrl --dry-run does not fetch or write", async () => { rawDir: ws.rawDir, eventsPath: ws.eventsPath, fetchImpl, + ...NO_LIGHTPANDA, dryRun: true }); @@ -424,7 +428,8 @@ test("ingestUrl raises FETCH_FAILED on non-2xx 
responses", async () => { ingestUrl("https://example.com/missing", { rawDir: ws.rawDir, eventsPath: ws.eventsPath, - fetchImpl + fetchImpl, + ...NO_LIGHTPANDA }), (err) => err instanceof IngestError && err.code === "FETCH_FAILED" ); @@ -440,6 +445,7 @@ test("ingestUrl routes URL PDFs through Path B with URL source preserved", async assetsDir: ws.assetsDir, eventsPath: ws.eventsPath, fetchImpl, + ...NO_LIGHTPANDA, documentOptions: { whichImpl: async () => null, extractPdfImpl: async (sourcePath) => { @@ -473,7 +479,8 @@ test("ingestUrl raises READABILITY_NULL on empty SPA shell (no silent body-dump) ingestUrl("https://example.com/spa", { rawDir: ws.rawDir, eventsPath: ws.eventsPath, - fetchImpl + fetchImpl, + ...NO_LIGHTPANDA }), (err) => err instanceof IngestError && err.code === "READABILITY_NULL" ); @@ -1225,6 +1232,7 @@ test("ingestUrl raises TIMEOUT when fetch is aborted", async () => { rawDir: ws.rawDir, eventsPath: ws.eventsPath, fetchImpl, + ...NO_LIGHTPANDA, timeoutMs: 30 }), (err) => err instanceof IngestError && err.code === "TIMEOUT" diff --git a/tests/core/lightpanda.test.mjs b/tests/core/lightpanda.test.mjs new file mode 100644 index 0000000..f4ed52a --- /dev/null +++ b/tests/core/lightpanda.test.mjs @@ -0,0 +1,143 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { lightpandaPlatformBinary, downloadLightpanda, resolveLightpanda } from "../../packages/core/src/lightpanda.mjs"; + +function makeFakeFetch({ status = 200, body = "FAKE_BINARY_BYTES" } = {}) { + return async () => ({ + ok: status >= 200 && status < 300, + status, + statusText: status === 200 ? 
"OK" : "ERR", + arrayBuffer: async () => new TextEncoder().encode(body).buffer + }); +} + +test("lightpandaPlatformBinary maps darwin+arm64 to aarch64-macos", () => { + assert.equal(lightpandaPlatformBinary({ platform: "darwin", arch: "arm64" }), "lightpanda-aarch64-macos"); +}); + +test("lightpandaPlatformBinary maps darwin+x64 to x86_64-macos", () => { + assert.equal(lightpandaPlatformBinary({ platform: "darwin", arch: "x64" }), "lightpanda-x86_64-macos"); +}); + +test("lightpandaPlatformBinary maps linux+arm64 to aarch64-linux", () => { + assert.equal(lightpandaPlatformBinary({ platform: "linux", arch: "arm64" }), "lightpanda-aarch64-linux"); +}); + +test("lightpandaPlatformBinary maps linux+x64 to x86_64-linux", () => { + assert.equal(lightpandaPlatformBinary({ platform: "linux", arch: "x64" }), "lightpanda-x86_64-linux"); +}); + +test("lightpandaPlatformBinary returns null on win32", () => { + assert.equal(lightpandaPlatformBinary({ platform: "win32", arch: "x64" }), null); +}); + +test("lightpandaPlatformBinary returns null on unknown platform/arch", () => { + assert.equal(lightpandaPlatformBinary({ platform: "linux", arch: "ppc64" }), null); + assert.equal(lightpandaPlatformBinary({ platform: "freebsd", arch: "x64" }), null); +}); + +test("lightpandaPlatformBinary defaults to current process when no arg", () => { + const out = lightpandaPlatformBinary(); + if (process.platform === "win32") assert.equal(out, null); + else assert.match(out, /^lightpanda-/); +}); + +test("downloadLightpanda writes binary to destBinPath and chmods +x on unix", { skip: process.platform === "win32" }, async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "dotaios-lp-dl-")); + const destBinPath = path.join(tmp, "bin", "lightpanda"); + try { + const result = await downloadLightpanda({ + silent: true, + fetchImpl: makeFakeFetch({ body: "BINARY" }), + destBinPath, + platformBinary: "lightpanda-x86_64-linux" + }); + assert.equal(result.ok, true); + const written = await 
fs.readFile(destBinPath, "utf8"); + assert.equal(written, "BINARY"); + const stat = await fs.stat(destBinPath); + assert.ok((stat.mode & 0o111) !== 0, "executable bit should be set"); + } finally { + await fs.rm(tmp, { recursive: true, force: true }); + } +}); + +test("downloadLightpanda returns { ok:false, reason } on HTTP error without throwing", async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "dotaios-lp-dl-")); + const destBinPath = path.join(tmp, "bin", "lightpanda"); + try { + const result = await downloadLightpanda({ + silent: true, + fetchImpl: makeFakeFetch({ status: 404 }), + destBinPath, + platformBinary: "lightpanda-x86_64-linux" + }); + assert.equal(result.ok, false); + assert.match(result.reason, /404/); + } finally { + await fs.rm(tmp, { recursive: true, force: true }); + } +}); + +test("downloadLightpanda returns { ok:false, reason } on network error without throwing", async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "dotaios-lp-dl-")); + const destBinPath = path.join(tmp, "bin", "lightpanda"); + try { + const result = await downloadLightpanda({ + silent: true, + fetchImpl: async () => { throw new Error("ECONNRESET"); }, + destBinPath, + platformBinary: "lightpanda-x86_64-linux" + }); + assert.equal(result.ok, false); + assert.match(result.reason, /ECONNRESET/); + } finally { + await fs.rm(tmp, { recursive: true, force: true }); + } +}); + +test("downloadLightpanda returns { ok:false, reason:'unsupported-platform' } when platformBinary null", async () => { + const result = await downloadLightpanda({ + silent: true, + fetchImpl: makeFakeFetch(), + destBinPath: path.join(os.tmpdir(), "noop"), + platformBinary: null + }); + assert.equal(result.ok, false); + assert.equal(result.reason, "unsupported-platform"); +}); + +test("resolveLightpanda returns local bin path when it exists and is executable", async () => { + const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "dotaios-lp-res-")); + const localBin = 
path.join(tmp, "lightpanda"); + await fs.writeFile(localBin, "#!/bin/sh\necho fake"); + await fs.chmod(localBin, 0o755); + try { + const result = await resolveLightpanda({ + localBinPath: localBin, + whichImpl: () => null + }); + assert.equal(result, localBin); + } finally { + await fs.rm(tmp, { recursive: true, force: true }); + } +}); + +test("resolveLightpanda falls back to PATH when local missing", async () => { + const result = await resolveLightpanda({ + localBinPath: "/nonexistent/lightpanda", + whichImpl: () => "/usr/local/bin/lightpanda" + }); + assert.equal(result, "/usr/local/bin/lightpanda"); +}); + +test("resolveLightpanda returns null when neither local nor PATH has it", async () => { + const result = await resolveLightpanda({ + localBinPath: "/nonexistent/lightpanda", + whichImpl: () => null + }); + assert.equal(result, null); +}); diff --git a/tests/core/paths.test.mjs b/tests/core/paths.test.mjs new file mode 100644 index 0000000..7fa4204 --- /dev/null +++ b/tests/core/paths.test.mjs @@ -0,0 +1,17 @@ +import os from "node:os"; +import path from "node:path"; +import test from "node:test"; +import assert from "node:assert/strict"; +import { dotaiosBinDir, lightpandaBinPath } from "../../packages/core/src/paths.mjs"; + +test("dotaiosBinDir returns ~/.dotaios/bin", () => { + assert.equal(dotaiosBinDir(), path.join(os.homedir(), ".dotaios", "bin")); +}); + +test("lightpandaBinPath returns ~/.dotaios/bin/lightpanda on unix", { skip: process.platform === "win32" }, () => { + assert.equal(lightpandaBinPath(), path.join(os.homedir(), ".dotaios", "bin", "lightpanda")); +}); + +test("lightpandaBinPath returns ~/.dotaios/bin/lightpanda.exe on windows", { skip: process.platform !== "win32" }, () => { + assert.equal(lightpandaBinPath(), path.join(os.homedir(), ".dotaios", "bin", "lightpanda.exe")); +}); diff --git a/tests/core/render.test.mjs b/tests/core/render.test.mjs index 31b8f35..e59905b 100644 --- a/tests/core/render.test.mjs +++ 
b/tests/core/render.test.mjs @@ -1,3 +1,5 @@ +import fs from "node:fs/promises"; +import path from "node:path"; import test from "node:test"; import assert from "node:assert/strict"; import { isHtmlComment, renderTemplate, templateOutputPath } from "../../packages/core/src/render.mjs"; @@ -57,3 +59,15 @@ test("isHtmlComment identifies HTML comment strings", () => { assert.equal(isHtmlComment(""), false); assert.equal(isHtmlComment(null), false); }); + +test("AGENTS.md.hbs Rules section includes dotaios ingest URL routing rule", async () => { + const tpl = await fs.readFile( + path.resolve("templates/AGENTS.md.hbs"), + "utf8" + ); + assert.match(tpl, /## Rules/); + assert.match(tpl, /dotaios ingest/); + assert.match(tpl, /URL/); + const rulesIdx = tpl.indexOf("## Rules"); + assert.ok(tpl.indexOf("dotaios ingest", rulesIdx) > rulesIdx, "rule must appear under Rules"); +});