From 47124a6a634cecb7a01f51491357a2b8d9985a98 Mon Sep 17 00:00:00 2001 From: Aidan McAlister Date: Wed, 6 May 2026 08:35:37 -0400 Subject: [PATCH] feat(docs): improve llms mdx component fidelity --- apps/docs/package.json | 1 + .../scripts/test-llm-markdown-fidelity.ts | 176 +++++++ apps/docs/src/lib/get-llm-text.ts | 60 +-- apps/docs/src/lib/llm-markdown.ts | 449 ++++++++++++++++++ 4 files changed, 627 insertions(+), 59 deletions(-) create mode 100644 apps/docs/scripts/test-llm-markdown-fidelity.ts create mode 100644 apps/docs/src/lib/llm-markdown.ts diff --git a/apps/docs/package.json b/apps/docs/package.json index 24618f0beb..9d9b47bb12 100644 --- a/apps/docs/package.json +++ b/apps/docs/package.json @@ -16,6 +16,7 @@ "lint:external-links": "tsx ./scripts/lint-external-links.ts", "lint:images": "tsx ./scripts/lint-images.ts", "lint:code": "tsx ./scripts/lint-code-blocks.ts", + "test:llm-markdown": "tsx ./scripts/test-llm-markdown-fidelity.ts", "audit:redirects": "node ./scripts/audit-redirects.mjs", "audit:redirects:strict": "node ./scripts/audit-redirects.mjs --strict", "lint:spellcheck": "cspell \"content/docs/**/*.mdx\" \"content/docs/**/*.json\" --show-context" diff --git a/apps/docs/scripts/test-llm-markdown-fidelity.ts b/apps/docs/scripts/test-llm-markdown-fidelity.ts new file mode 100644 index 0000000000..4db2db330f --- /dev/null +++ b/apps/docs/scripts/test-llm-markdown-fidelity.ts @@ -0,0 +1,176 @@ +import { strict as assert } from "node:assert"; +import { normalizeProcessedMarkdown } from "../src/lib/llm-markdown"; + +const rawComponentPattern = + /<(?:APIPage|CodeBlockTabs|CodeBlockTab|Tabs|Tab|Cards|Card|Accordions|Accordion|Youtube|Button|SharedContent|Steps|Step)\b/; + +type SnapshotCase = { + name: string; + input: string; + expected: string; +}; + +const snapshots: SnapshotCase[] = [ + { + name: "APIPage", + input: ``, + expected: `## API reference + +### GET /v1/example + +\`GET /v1/example\``, + }, + { + name: "CodeBlockTabs", + input: ` + + npm + pnpm + + + + \`\`\`bash + npm install @prisma/client + \`\`\` + + + + \`\`\`bash + pnpm add @prisma/client + \`\`\` + +`, + expected: `#### npm + +\`\`\`bash +npm install @prisma/client +\`\`\` + +#### pnpm + +\`\`\`bash +pnpm add @prisma/client +\`\`\``, + }, + { + name: "manual Tabs", + input: ` + + 1. Create \`seed.ts\`. + + \`\`\`ts + console.log("seed"); + \`\`\` + + + + 1. Create \`seed.js\`. + +`, + expected: `#### TypeScript + +1. Create \`seed.ts\`. + + \`\`\`ts + console.log("seed"); + \`\`\` + +#### JavaScript + +1. Create \`seed.js\`.`, + }, + { + name: "admonition", + input: ` + Before you continue + + Keep both paragraphs. + + - Parent item + - Nested item +`, + expected: `> [!WARNING] +> Before you continue +> +> Keep both paragraphs. +> +> - Parent item +> - Nested item`, + }, + { + name: "Accordion and Youtube", + input: ` + + + +`, + expected: `### Watch video: Multi-file Prisma schema + +[How to split your Prisma schema](https://www.youtube.com/watch?v=abc123)`, + }, + { + name: "Cards", + input: ` + }> + Provision a short-lived Prisma Postgres database. + + + + Choose the right connection string. + +`, + expected: `- [Create a temporary database](/postgres/npx-create-db): Provision a short-lived Prisma Postgres database. + +- [Connect to your database](/postgres/database/connecting-to-your-database): Choose the right connection string.`, + }, + { + name: "Button", + input: ``, + expected: `[Install the Prisma plugin for Cursor](https://cursor.com/marketplace/prisma)`, + }, + { + name: "SharedContent and Steps", + input: ` + Shared paragraph. + + + + + Run the command. + +`, + expected: `Shared paragraph. + +### Install + +Run the command.`, + }, +]; + +function stripFencedCodeBlocks(markdown: string) { + return markdown.replace(/^([ \t]*)([`~]{3,})[^\n]*\n[\s\S]*?^\1\2\s*$/gm, ""); +} + +for (const snapshot of snapshots) { + const actual = normalizeProcessedMarkdown(snapshot.input); + assert.equal(actual, snapshot.expected, snapshot.name); + assert.equal( + rawComponentPattern.test(stripFencedCodeBlocks(actual)), + false, + `${snapshot.name} leaves raw MDX component JSX in markdown output`, + ); +} + +const codeFenceInput = `\`\`\`tsx +Keep component examples intact inside code fences. +\`\`\``; + +assert.equal( + normalizeProcessedMarkdown(codeFenceInput), + codeFenceInput, + "code fences are preserved", +); + +console.log(`LLM markdown fidelity snapshots passed (${snapshots.length + 1} cases).`); diff --git a/apps/docs/src/lib/get-llm-text.ts b/apps/docs/src/lib/get-llm-text.ts index d3a843f060..0a5ea9eef2 100644 --- a/apps/docs/src/lib/get-llm-text.ts +++ b/apps/docs/src/lib/get-llm-text.ts @@ -1,4 +1,5 @@ import { source } from "@/lib/source"; +import { normalizeProcessedMarkdown } from "@/lib/llm-markdown"; import { getPageTitleText } from "@/lib/page-title"; import { getBaseUrl, withDocsBasePath } from "@/lib/urls"; import type { InferPageType } from "fumadocs-core/source"; @@ -172,65 +173,6 @@ function formatRelatedPages(relatedPages: RelatedPageLink[]) { return `\n\n## Related pages\n\n${links}`; } -function trimComponentContent(value: string) { - const lines = value.replace(/^\n+|\n+$/g, "").split("\n"); - const indent = lines - .filter((line) => line.trim().length > 0) - .reduce((minimum, line) => Math.min(minimum, line.match(/^ */)?.[0].length ?? 0), Infinity); - - return lines - .map((line) => (Number.isFinite(indent) ? line.slice(indent) : line)) - .join("\n") - .trim(); -} - -function cleanCalloutContent(value: string) { - return trimComponentContent(value) - .replace( - /([\s\S]*?)<\/Callout(?:Title|Description)>/g, - (_match, content: string) => trimComponentContent(content), - ) - .replace(/<\/?(?:CalloutTitle|CalloutDescription)>/g, "") - .replace(/^(?:[ \t]*\n)+|(?:\n[ \t]*)+$/g, "") - .split("\n") - .map((line) => line.replace(/[ \t]+$/g, "")) - .join("\n"); -} - -function formatCallout(type: string, content: string) { - const label = type.trim().toUpperCase() || "NOTE"; - const text = cleanCalloutContent(content); - if (!text) return ""; - - return `> [!${label}]\n${text - .split("\n") - .map((line) => `> ${line}`) - .join("\n")}`; -} - -function formatCodeBlockTab(value: string, content: string) { - const text = trimComponentContent(content); - if (!text) return ""; - - return `#### ${value.trim()}\n\n${text}`; -} - -function normalizeProcessedMarkdown(markdown: string) { - return markdown - .replace( - /]*>([\s\S]*?)<\/CalloutContainer>/g, - (_match, type: string, content: string) => formatCallout(type, content), - ) - .replace(/[\s\S]*?<\/CodeBlockTabsList>/g, "") - .replace( - /]*>([\s\S]*?)<\/CodeBlockTab>/g, - (_match, value: string, content: string) => formatCodeBlockTab(value, content), - ) - .replace(/<\/?CodeBlockTabs[^>]*>/g, "") - .replace(/\n{3,}/g, "\n\n") - .trim(); -} - export async function getLLMText(page: DocsPage) { const processed = normalizeProcessedMarkdown(await page.data.getText("processed")); const breadcrumbLine = getBreadcrumbLine(page); diff --git a/apps/docs/src/lib/llm-markdown.ts b/apps/docs/src/lib/llm-markdown.ts new file mode 100644 index 0000000000..8dad6a84d1 --- /dev/null +++ b/apps/docs/src/lib/llm-markdown.ts @@ -0,0 +1,449 @@ +import { readFileSync } from "node:fs"; +import { join } from "node:path"; + +type OpenApiSpec = { + paths?: Record>; +}; + +type OpenApiOperation = { + summary?: string; + description?: string; + parameters?: OpenApiParameter[]; + requestBody?: { + content?: Record; + }; + responses?: Record; +}; + +type OpenApiParameter = { + name?: string; + in?: string; + description?: string; + required?: boolean; + schema?: JsonSchema; +}; + +type JsonSchema = { + type?: string | string[]; + description?: string; + default?: unknown; + enum?: unknown[]; + properties?: Record; + required?: string[]; + $ref?: string; +}; + +type ApiPageOperation = { + path?: string; + method?: string; +}; + +let openApiSpecCache: OpenApiSpec | null | undefined; + +function getAttribute(attrs: string, name: string) { + const pattern = new RegExp( + `${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|\\{\\s*"([^"]*)"\\s*\\}|\\{\\s*'([^']*)'\\s*\\})`, + ); + const match = attrs.match(pattern); + return match?.slice(1).find((value) => value !== undefined); +} + +function cleanInlineText(value: string | undefined) { + return value?.replace(/\s+/g, " ").trim(); +} + +function formatDefaultValue(value: unknown) { + if (value === undefined) return undefined; + return typeof value === "string" ? value : JSON.stringify(value); +} + +function formatSchemaType(schema: JsonSchema | undefined) { + if (!schema) return undefined; + if (schema.$ref) return schema.$ref.split("/").at(-1); + if (Array.isArray(schema.type)) return schema.type.join(" | "); + if (schema.type) return schema.type; + if (schema.enum) return "enum"; + if (schema.properties) return "object"; + return undefined; +} + +function getJsonSchema(content: OpenApiOperation["requestBody"] | undefined) { + return content?.content?.["application/json"]?.schema; +} + +function loadOpenApiSpec() { + if (openApiSpecCache !== undefined) return openApiSpecCache; + + for (const cachePath of [ + join(process.cwd(), "cache", "openapi.json"), + join(process.cwd(), "apps/docs/cache/openapi.json"), + ]) { + try { + openApiSpecCache = JSON.parse(readFileSync(cachePath, "utf8")) as OpenApiSpec; + return openApiSpecCache; + } catch {} + } + + openApiSpecCache = null; + return openApiSpecCache; +} + +function getOpenApiOperation(path: string, method: string) { + const spec = loadOpenApiSpec(); + return spec?.paths?.[path]?.[method.toLowerCase()]; +} + +function formatParameter(parameter: OpenApiParameter) { + const name = parameter.name ?? "parameter"; + const location = parameter.in ? `${parameter.in}` : "parameter"; + const required = parameter.required ? "required" : "optional"; + const type = formatSchemaType(parameter.schema); + const details = [location, type, required].filter(Boolean).join(", "); + const description = cleanInlineText(parameter.description ?? parameter.schema?.description); + const suffix = description ? `: ${description}` : ""; + + return `- \`${name}\`${details ? ` (${details})` : ""}${suffix}`; +} + +function formatRequestBody(operation: OpenApiOperation) { + const schema = getJsonSchema(operation.requestBody); + const properties = schema?.properties; + if (!properties) return ""; + + const required = new Set(schema.required ?? []); + const lines = Object.entries(properties).map(([name, property]) => { + const type = formatSchemaType(property); + const defaultValue = formatDefaultValue(property?.default); + const description = cleanInlineText(property?.description); + const details = [type, required.has(name) ? "required" : "optional"].filter(Boolean).join(", "); + const metadata = [ + description, + defaultValue !== undefined ? `Default: \`${defaultValue}\`.` : undefined, + ].filter(Boolean); + + return `- \`${name}\`${details ? ` (${details})` : ""}${metadata.length > 0 ? `: ${metadata.join(" ")}` : ""}`; + }); + + return lines.length > 0 ? `\n\n#### Request body\n\n${lines.join("\n")}` : ""; +} + +function formatResponses(operation: OpenApiOperation) { + const responses = operation.responses; + if (!responses) return ""; + + const lines = Object.entries(responses).map(([status, response]) => { + const description = cleanInlineText(response?.description); + return `- \`${status}\`${description ? `: ${description}` : ""}`; + }); + + return lines.length > 0 ? `\n\n#### Responses\n\n${lines.join("\n")}` : ""; +} + +function parseApiPageOperations(value: string): ApiPageOperation[] { + const match = value.match(/operations=\{\s*(\[[\s\S]*?\])\s*\}/); + if (!match) return []; + + try { + const operations = JSON.parse(match[1]) as ApiPageOperation[]; + return Array.isArray(operations) ? operations : []; + } catch { + return []; + } +} + +function formatApiOperation(operation: ApiPageOperation) { + if (!operation.path || !operation.method) return ""; + + const method = operation.method.toUpperCase(); + const apiOperation = getOpenApiOperation(operation.path, operation.method); + const summary = cleanInlineText(apiOperation?.summary); + const description = cleanInlineText(apiOperation?.description); + const parameters = apiOperation?.parameters ?? []; + const parameterText = + parameters.length > 0 + ? `\n\n#### Parameters\n\n${parameters.map(formatParameter).join("\n")}` + : ""; + const requestBodyText = apiOperation ? formatRequestBody(apiOperation) : ""; + const responsesText = apiOperation ? formatResponses(apiOperation) : ""; + const title = summary ? `### ${summary}` : `### ${method} ${operation.path}`; + const endpoint = `\`${method} ${operation.path}\``; + + return `${title}\n\n${endpoint}${description ? `\n\n${description}` : ""}${parameterText}${requestBodyText}${responsesText}`; +} + +function formatApiPage(value: string) { + const operations = parseApiPageOperations(value); + const text = operations.map(formatApiOperation).filter(Boolean).join("\n\n"); + + if (!text) return "## API reference\n\n_API reference details unavailable in markdown output._"; + + return `## API reference\n\n${text}`; +} + +function trimComponentContent(value: string) { + const lines = value.replace(/^\n+|\n+$/g, "").split("\n"); + const indent = lines + .filter((line) => line.trim().length > 0) + .reduce((minimum, line) => Math.min(minimum, line.match(/^ */)?.[0].length ?? 0), Infinity); + + return lines + .map((line) => (Number.isFinite(indent) ? line.slice(indent) : line)) + .join("\n") + .trim(); +} + +function cleanCalloutContent(value: string) { + return trimComponentContent(value) + .replace( + /([\s\S]*?)<\/Callout(?:Title|Description)>/g, + (_match, content: string) => trimComponentContent(content), + ) + .replace(/<\/?(?:CalloutTitle|CalloutDescription)>/g, "") + .replace(/^(?:[ \t]*\n)+|(?:\n[ \t]*)+$/g, "") + .split("\n") + .map((line) => line.replace(/[ \t]+$/g, "")) + .join("\n"); +} + +function formatCallout(type: string, content: string) { + const labelMap: Record = { + danger: "CAUTION", + error: "CAUTION", + info: "NOTE", + note: "NOTE", + ppg: "NOTE", + success: "TIP", + tip: "TIP", + warn: "WARNING", + warning: "WARNING", + }; + const label = labelMap[type.trim().toLowerCase()] ?? "NOTE"; + const text = cleanCalloutContent(content); + if (!text) return ""; + + return `> [!${label}]\n${text + .split("\n") + .map((line) => `> ${line}`) + .join("\n")}`; +} + +function formatCodeBlockTab(value: string, content: string) { + const text = trimComponentContent(content); + if (!text) return ""; + + return `#### ${value.trim()}\n\n${text}`; +} + +function formatSectionComponent(attrs: string, content: string, fallbackTitle: string) { + const title = getAttribute(attrs, "title") ?? getAttribute(attrs, "value") ?? fallbackTitle; + const text = trimComponentContent(content); + + return text ? `### ${title}\n\n${text}` : `### ${title}`; +} + +function formatYoutube(attrs: string) { + const videoId = getAttribute(attrs, "videoId"); + const title = getAttribute(attrs, "title") ?? "Watch video"; + if (!videoId) return title; + + return `[${title}](https://www.youtube.com/watch?v=${videoId})`; +} + +function convertHtmlLinks(value: string) { + return value.replace(/]*)>([\s\S]*?)<\/a>/g, (_match, attrs: string, content: string) => { + const href = getAttribute(attrs, "href"); + const label = trimComponentContent(content).replace(/\s+/g, " "); + return href ? `[${label}](${href})` : label; + }); +} + +function stripJsxTags(value: string) { + return convertHtmlLinks(value) + .replace(/<\/?[A-Z][A-Za-z0-9]*(?:\s[^>]*)?>/g, "") + .replace(/<\/?a(?:\s[^>]*)?>/g, "") + .replace(/\{["']\s*["']\}/g, " ") + .trim(); +} + +function formatCard(attrs: string, content: string) { + const title = getAttribute(attrs, "title") ?? "Card"; + const href = getAttribute(attrs, "href"); + const text = stripJsxTags(trimComponentContent(content)).replace(/\n+/g, " "); + const label = href ? `[${title}](${href})` : title; + + return `- ${label}${text ? `: ${text}` : ""}`; +} + +function formatButton(_attrs: string, content: string) { + return stripJsxTags(trimComponentContent(content)); +} + +function findOpeningTagEnd(value: string, startIndex: number) { + let quote: string | undefined; + let braceDepth = 0; + + for (let index = startIndex; index < value.length; index++) { + const char = value[index]; + const previous = value[index - 1]; + + if (quote) { + if (char === quote && previous !== "\\") quote = undefined; + continue; + } + + if (char === '"' || char === "'") { + quote = char; + continue; + } + + if (char === "{") { + braceDepth++; + continue; + } + + if (char === "}" && braceDepth > 0) { + braceDepth--; + continue; + } + + if (char === ">" && braceDepth === 0) return index; + } + + return -1; +} + +function isComponentTag(value: string, index: number, name: string) { + const next = value[index + name.length + 1]; + return value.startsWith(`<${name}`, index) && !/[A-Za-z0-9]/.test(next ?? ""); +} + +function replaceComponentBlocks( + markdown: string, + name: string, + format: (attrs: string, content: string) => string, +) { + let result = ""; + let cursor = 0; + + while (cursor < markdown.length) { + const start = markdown.indexOf(`<${name}`, cursor); + if (start === -1) { + result += markdown.slice(cursor); + break; + } + + if (!isComponentTag(markdown, start, name)) { + result += markdown.slice(cursor, start + 1); + cursor = start + 1; + continue; + } + + const openingEnd = findOpeningTagEnd(markdown, start); + if (openingEnd === -1) { + result += markdown.slice(cursor); + break; + } + + const openingTag = markdown.slice(start, openingEnd + 1); + const attrs = openingTag + .replace(new RegExp(`^<${name}\\b`), "") + .replace(/\/?>$/, "") + .trim(); + const isSelfClosing = openingTag.replace(/\s+$/, "").endsWith("/>"); + + result += markdown.slice(cursor, start); + + if (isSelfClosing) { + result += format(attrs, ""); + cursor = openingEnd + 1; + continue; + } + + const closingTag = ``; + const closingStart = markdown.indexOf(closingTag, openingEnd + 1); + if (closingStart === -1) { + result += openingTag; + cursor = openingEnd + 1; + continue; + } + + result += format(attrs, markdown.slice(openingEnd + 1, closingStart)); + cursor = closingStart + closingTag.length; + } + + return result; +} + +function protectFencedCodeBlocks(markdown: string) { + const blocks: string[] = []; + const protectedMarkdown = markdown.replace( + /^([ \t]*)([`~]{3,})[^\n]*\n[\s\S]*?^\1\2\s*$/gm, + (match) => { + const token = `__LLM_FENCED_CODE_BLOCK_${blocks.length}__`; + blocks.push(match); + return token; + }, + ); + + return { + markdown: protectedMarkdown, + restore(value: string) { + return blocks.reduce( + (text, block, index) => text.replace(`__LLM_FENCED_CODE_BLOCK_${index}__`, block), + value, + ); + }, + }; +} + +export function normalizeProcessedMarkdown(markdown: string) { + const componentMarkdown = markdown + .replace(/\{\/\*[\s\S]*?\*\/\}/g, "") + .replace( + /]*>([\s\S]*?)<\/CalloutContainer>/g, + (_match, type: string, content: string) => formatCallout(type, content), + ) + .replace(/[\s\S]*?<\/CodeBlockTabsList>/g, "") + .replace( + /]*>([\s\S]*?)<\/CodeBlockTab>/g, + (_match, value: string, content: string) => formatCodeBlockTab(value, content), + ) + .replace(/<\/?CodeBlockTabs[^>]*>/g, "") + .replace( + /]*>([\s\S]*?)<\/Tab>/g, + (_match, value: string, content: string) => formatCodeBlockTab(value, content), + ) + .replace(//g, "") + .replace(/<\/?(?:Tabs|TabsContent)[^>]*>/g, "") + .replace( + /]*)>([\s\S]*?)<\/Accordion>/g, + (_match, attrs: string, content: string) => + formatSectionComponent(attrs, content, "Accordion"), + ) + .replace(/<\/?Accordions[^>]*>/g, "") + .replace(/]*)>([\s\S]*?)<\/Step>/g, (_match, attrs: string, content: string) => + formatSectionComponent(attrs, content, "Step"), + ) + .replace(/<\/?Steps[^>]*>/g, "") + .replace(/]*>([\s\S]*?)<\/SharedContent>/g, (_match, content: string) => + trimComponentContent(content), + ) + .replace(/]*\/>/g, ""); + + const protectedCode = protectFencedCodeBlocks(componentMarkdown); + const withoutJsxComponents = replaceComponentBlocks( + replaceComponentBlocks(protectedCode.markdown, "Card", formatCard) + .replace(/<\/?Cards[^>]*>/g, "") + .replace(//g, (match: string) => formatApiPage(match)) + .replace(//g, (_match, attrs: string) => formatYoutube(attrs)), + "Button", + formatButton, + ); + + return protectedCode + .restore(withoutJsxComponents) + .replace(/^[ \t]+(#{3,4} )/gm, "$1") + .replace(/^[ \t]+(- \[)/gm, "$1") + .replace(/\n{3,}/g, "\n\n") + .trim(); +}