diff --git a/core-ingestion/src/__tests__/queries.latex.test.ts b/core-ingestion/src/__tests__/queries.latex.test.ts
new file mode 100644
index 0000000..f211265
--- /dev/null
+++ b/core-ingestion/src/__tests__/queries.latex.test.ts
@@ -0,0 +1,258 @@
+import { describe, it, expect } from 'vitest';
+import { parseFile } from '../index.js';
+import { SupportedLanguages } from '../languages.js';
+
+const probe = parseFile('/repo/a.tex', '\\documentclass{article}\n');
+const describeFn = probe && probe.language === SupportedLanguages.LaTeX ? describe : describe.skip;
+
+const rels = (r: ReturnType<typeof parseFile>, p: string) =>
+  r!.relationships.filter(x => x.predicate === p);
+
+describeFn('LaTeX parser', () => {
+  it('detects LaTeX by extension (.tex/.sty/.cls/.ltx/.latex)', () => {
+    for (const ext of ['tex', 'sty', 'cls', 'ltx', 'latex']) {
+      expect(parseFile(`/r/m.${ext}`, '\\documentclass{article}')!.language).toBe(SupportedLanguages.LaTeX);
+    }
+  });
+
+  it('builds a nested sectioning hierarchy via CONTAINS', () => {
+    const r = parseFile('/r/main.tex', `
+\\part{One}
+\\chapter{Intro}
+\\section{Background}
+\\subsection{Prior work}
+\\section{Methods}
+`);
+    const sections = r!.entities.filter(e => e.kind === 'section').map(e => e.name);
+    expect(sections).toEqual(['One', 'Intro', 'Background', 'Prior work', 'Methods']);
+    const contains = rels(r, 'CONTAINS').map(e => `${e.srcName}>${e.dstName}`);
+    expect(contains).toContain('main.tex>One');
+    expect(contains).toContain('One>Intro');
+    expect(contains).toContain('Intro>Background');
+    expect(contains).toContain('Background>Prior work');
+    // Methods pops back to chapter scope: it is a sibling of Background under Intro
+    expect(contains).toContain('Intro>Methods');
+  });
+
+  it('captures starred sections and cleans nested markup in titles', () => {
+    const r = parseFile('/r/m.tex', '\\section*{The \\textbf{Bold} Title}\n');
+    const sections = r!.entities.filter(e => e.kind === 'section').map(e => e.name);
+    expect(sections).toEqual(['The Bold Title']);
+  });
+
+  it('captures \\newcommand / \\renewcommand / \\providecommand / \\DeclareRobustCommand', () => {
+    const r = parseFile('/r/m.tex', `
+\\newcommand{\\foo}{x}
+\\renewcommand{\\bar}[1]{#1}
+\\providecommand{\\baz}{y}
+\\DeclareRobustCommand{\\qux}{z}
+\\newcommand\\unbraced{w}
+`);
+    const fns = r!.entities.filter(e => e.kind === 'function').map(e => e.name).sort();
+    expect(fns).toEqual(['bar', 'baz', 'foo', 'qux', 'unbraced']);
+  });
+
+  it('captures \\def and \\let control-sequence definitions', () => {
+    const r = parseFile('/r/m.tex', '\\def\\mymacro{hello}\n\\let\\other=\\relax\n');
+    const fns = r!.entities.filter(e => e.kind === 'function').map(e => e.name).sort();
+    expect(fns).toEqual(['mymacro', 'other']);
+  });
+
+  it('does not emit a bogus "csname" macro for \\def\\csname...\\endcsname', () => {
+    const r = parseFile('/r/m.tex', '\\expandafter\\def\\csname my@dynamic\\endcsname{body}\n\\def\\real{x}\n');
+    const fns = r!.entities.filter(e => e.kind === 'function').map(e => e.name);
+    expect(fns).toEqual(['real']);
+  });
+
+  it('strips inline math/markup from section titles', () => {
+    const r = parseFile('/r/m.tex', '\\section{Convergence of \\texorpdfstring{$\\sigma$}{[sigma]}}\n');
+    expect(r!.entities.filter(e => e.kind === 'section').map(e => e.name)).toEqual(['Convergence of [sigma]']);
+  });
+
+  it('captures \\newenvironment and \\newtheorem as definitions', () => {
+    const r = parseFile('/r/m.tex', `
+\\newenvironment{myenv}{\\begin{center}}{\\end{center}}
+\\newtheorem{thm}{Theorem}
+\\newtheorem*{lem}{Lemma}
+`);
+    const classes = r!.entities.filter(e => e.kind === 'class').map(e => e.name).sort();
+    expect(classes).toEqual(['lem', 'myenv', 'thm']);
+  });
+
+  it('emits IMPORTS for \\documentclass / \\usepackage / \\RequirePackage', () => {
+    const r = parseFile('/r/m.tex', `
+\\documentclass[12pt]{article}
+\\usepackage{amsmath}
+\\usepackage[utf8]{inputenc}
+\\usepackage{tikz,pgfplots}
+\\RequirePackage{xcolor}
+`);
+    const imports = rels(r, 'IMPORTS').map(e => e.dstName).sort();
+    expect(imports).toEqual(['amsmath', 'article', 'inputenc', 'pgfplots', 'tikz', 'xcolor']);
+    // package imports preserve a bare (non-path) importRaw
+    expect(rels(r, 'IMPORTS').every(e => e.importRaw && !e.importRaw.includes('/'))).toBe(true);
+  });
+
+  it('resolves \\input / \\include / \\subfile to .tex targets', () => {
+    const r = parseFile('/r/main.tex', `
+\\input{header}
+\\include{chapters/intro}
+\\subfile{sections/methods.tex}
+\\import{parts/}{appendix}
+`);
+    const imports = rels(r, 'IMPORTS');
+    const dst = imports.map(e => e.dstName).sort();
+    expect(dst).toEqual(['chapters/intro.tex', 'header.tex', 'parts/appendix.tex', 'sections/methods.tex']);
+    // raw specifier retained for path resolution / multi-repo gate
+    expect(imports.find(e => e.dstName === 'header.tex')!.importRaw).toBe('header');
+  });
+
+  it('emits \\label anchors and resolves \\ref/\\eqref/\\cref as REFERENCES', () => {
+    const r = parseFile('/r/m.tex', `
+\\section{Results}
+\\label{sec:results}
+\\begin{equation}\\label{eq:main}\\end{equation}
+See \\ref{sec:results} and \\eqref{eq:main}.
+Also \\cref{sec:results,eq:main}.
+`);
+    const labels = r!.entities.filter(e => e.kind === 'label').map(e => e.name).sort();
+    expect(labels).toEqual(['eq:main', 'sec:results']);
+    const refs = rels(r, 'REFERENCES').map(e => e.dstName).sort();
+    expect(refs).toEqual(['eq:main', 'eq:main', 'sec:results', 'sec:results']);
+  });
+
+  it('emits \\cite/\\citep/\\citet REFERENCES to bib keys (incl. comma lists)', () => {
+    const r = parseFile('/r/m.tex', `
+\\cite{knuth1984}
+\\citep{lamport1994,goossens1993}
+\\citet{wilson2020}
+`);
+    const refs = rels(r, 'REFERENCES').map(e => e.dstName).sort();
+    expect(refs).toEqual(['goossens1993', 'knuth1984', 'lamport1994', 'wilson2020']);
+  });
+
+  it('records environments and their content under the enclosing section', () => {
+    const r = parseFile('/r/m.tex', `
+\\section{Figures}
+\\begin{figure}
+\\caption{A figure}
+\\label{fig:a}
+\\end{figure}
+\\begin{table}
+\\end{table}
+`);
+    const envs = r!.entities.filter(e => e.kind === 'environment');
+    expect(envs.map(e => e.name).sort()).toEqual(['figure', 'table']);
+    const contains = rels(r, 'CONTAINS').map(e => `${e.srcName}>${e.dstName}`);
+    expect(contains).toContain('Figures>figure');
+    expect(contains).toContain('Figures>table');
+    // Content attaches to the section, not the (non-unique) environment name —
+    // an environment name is never used as a CONTAINS source (it would dangle:
+    // patch-builder resolves edge sources by bare name with no container hint).
+    expect(contains).toContain('Figures>fig:a');
+    expect(rels(r, 'CONTAINS').some(e => e.srcName === 'figure' || e.srcName === 'table')).toBe(false);
+  });
+
+  it('keeps labels resolvable when the same environment type repeats across sections', () => {
+    const r = parseFile('/r/m.tex', `
+\\section{Results}
+\\begin{figure}\\label{fig:a}\\end{figure}
+\\section{Discussion}
+\\begin{figure}\\label{fig:b}\\end{figure}
+`);
+    const contains = rels(r, 'CONTAINS').map(e => `${e.srcName}>${e.dstName}`);
+    // Each label is contained by its own (uniquely named) section, so no edge
+    // is sourced from the ambiguous "figure" node.
+    expect(contains).toContain('Results>fig:a');
+    expect(contains).toContain('Discussion>fig:b');
+    expect(rels(r, 'CONTAINS').some(e => e.srcName === 'figure')).toBe(false);
+  });
+
+  it('does not prefix file-level definitions with the file name (container undefined)', () => {
+    const r = parseFile('/r/m.tex', '\\newcommand{\\toplevel}{x}\n\\label{top}\n');
+    const defs = r!.entities.filter(e => e.kind === 'function' || e.kind === 'label');
+    expect(defs.every(e => e.container === undefined)).toBe(true);
+  });
+
+  it('skips a verbatim body closed with whitespace (\\end {verbatim})', () => {
+    const r = parseFile('/r/m.tex', `
+\\begin{verbatim}
+\\newcommand{\\fake}{nope}
+\\end {verbatim}
+\\section{Real}
+`);
+    expect(r!.entities.filter(e => e.kind === 'function')).toHaveLength(0);
+    expect(r!.entities.filter(e => e.kind === 'section').map(e => e.name)).toEqual(['Real']);
+  });
+
+  it('does not scan command bodies inside verbatim/lstlisting environments', () => {
+    const r = parseFile('/r/m.tex', `
+\\begin{verbatim}
+\\newcommand{\\fake}{should not be captured}
+\\section{not a real section}
+\\end{verbatim}
+\\section{Real}
+`);
+    expect(r!.entities.filter(e => e.kind === 'function')).toHaveLength(0);
+    expect(r!.entities.filter(e => e.kind === 'section').map(e => e.name)).toEqual(['Real']);
+  });
+
+  it('ignores comments (unescaped %) but keeps escaped \\%', () => {
+    const r = parseFile('/r/m.tex', `
+% \\section{Commented out}
+\\section{Live} % \\usepackage{nope}
+50\\% done
+`);
+    expect(r!.entities.filter(e => e.kind === 'section').map(e => e.name)).toEqual(['Live']);
+    expect(rels(r, 'IMPORTS')).toHaveLength(0);
+  });
+
+  it('captures cross-references nested inside unknown command arguments', () => {
+    const r = parseFile('/r/m.tex', '\\caption{See \\ref{fig:x} and \\cite{paper}}\n');
+    const refs = rels(r, 'REFERENCES').map(e => e.dstName).sort();
+    expect(refs).toEqual(['fig:x', 'paper']);
+  });
+
+  it('handles malformed / unbalanced input without crashing or hanging', () => {
+    const samples = [
+      '\\begin{figure}\n\\section{Orphan}\n', // unclosed environment
+      '\\end{figure}\n', // stray \end
+      '\\newcommand{\\foo', // unterminated brace
+      '\\section{', // unterminated title
+      '\\usepackage{a,b,', // unterminated list
+      '\\\\ \\% \\{ \\} \\$ text', // control symbols only
+      '%'.repeat(10000), // long comment
+      '{'.repeat(5000), // deep open braces
+    ];
+    for (const s of samples) {
+      const r = parseFile('/r/x.tex', s);
+      expect(r).not.toBeNull();
+      expect(r!.language).toBe(SupportedLanguages.LaTeX);
+    }
+  });
+
+  it('does not treat Object.prototype names (\\constructor, \\toString) as sectioning commands', () => {
+    const r = parseFile('/r/m.tex', '\\constructor{Nope}\n\\toString{Nope}\n\\section{Real}\n');
+    expect(r!.entities.filter(e => e.kind === 'section').map(e => e.name)).toEqual(['Real']);
+  });
+
+  it('is deterministic (byte-identical entities/relationships on re-parse)', () => {
+    const src = `
+\\documentclass{book}
+\\usepackage{amsmath}
+\\begin{document}
+\\chapter{One}\\label{ch:one}
+\\section{Alpha}
+\\newcommand{\\x}{1}
+\\begin{theorem}\\label{thm:1}\\end{theorem}
+See \\ref{ch:one} and \\cite{a,b}.
+\\input{more}
+\\end{document}
+`;
+    const a = parseFile('/r/book.tex', src)!;
+    const b = parseFile('/r/book.tex', src)!;
+    expect(JSON.stringify(a.entities)).toBe(JSON.stringify(b.entities));
+    expect(JSON.stringify(a.relationships)).toBe(JSON.stringify(b.relationships));
+    expect(JSON.stringify(a.chunks)).toBe(JSON.stringify(b.chunks));
+  });
+});
diff --git a/core-ingestion/src/index.ts b/core-ingestion/src/index.ts
index ee35bc0..8dcc0ec 100644
--- a/core-ingestion/src/index.ts
+++ b/core-ingestion/src/index.ts
@@ -356,7 +356,7 @@ function builtinsForLanguage(lang: SupportedLanguages): Set<string> {
 export function isGrammarSupported(filePath: string): boolean {
   const language = languageFromPath(filePath);
   if (!language) return false;
-  if (language === SupportedLanguages.YAML || language === SupportedLanguages.Dockerfile || language === SupportedLanguages.SQL || language === SupportedLanguages.JSON || language === SupportedLanguages.TOML || language === SupportedLanguages.Markdown) return true;
+  if (language === SupportedLanguages.YAML || language === SupportedLanguages.Dockerfile || language === SupportedLanguages.SQL || language === SupportedLanguages.JSON || language === SupportedLanguages.TOML || language === SupportedLanguages.Markdown || language === SupportedLanguages.LaTeX) return true;
   if (filePath.endsWith('.tsx')) return true; // TSX uses TypeScript.tsx, always available
   return GRAMMAR_MAP[language] !== undefined;
 }
@@ -995,6 +995,352 @@ function parseMarkdownFile(filePath: string, source: string): FileParseResult {
   return { filePath, language, entities, chunks, relationships, fileRole };
 }
 
+// ---------------------------------------------------------------------------
+// LaTeX / TeX (.tex, .sty, .cls, .ltx, .latex)
+//
+// Hand-rolled scanner rather than tree-sitter: there is no maintained Node
+// tree-sitter-latex binding for this ABI, and TeX is a macro-expansion language
+// whose custom-macro arities a static grammar cannot know — real grammars emit
+// ERROR nodes on ordinary documents. A targeted single-pass scanner extracts the
+// constructs Ix models (sectioning hierarchy, macro/environment/theorem defs,
+// labels, package/file dependencies, ref/cite cross-references) and degrades
+// gracefully on malformed input. O(n), deterministic.
+// ---------------------------------------------------------------------------
+
+const LATEX_SECTION_LEVELS: Record<string, number> = {
+  part: 0, chapter: 1, section: 2, subsection: 3,
+  subsubsection: 4, paragraph: 5, subparagraph: 6,
+};
+const LATEX_MACRO_DEFS = new Set([
+  'newcommand', 'renewcommand', 'providecommand', 'DeclareRobustCommand',
+  'DeclareMathOperator', 'newcommandx', 'renewcommandx', 'providecommandx',
+]);
+const LATEX_CS_DEFS = new Set(['def', 'gdef', 'edef', 'xdef', 'let']);
+// When a definition's target is one of these primitives the real name is built
+// at expansion time (e.g. \def\csname foo\endcsname{...} defines "foo", not
+// "csname"), so it is not statically knowable — skip rather than emit noise.
+const LATEX_DYNAMIC_CS = new Set(['csname', 'endcsname']);
+const LATEX_ENV_DEFS = new Set(['newenvironment', 'renewenvironment', 'newenvironmentx']);
+const LATEX_THEOREM_DEFS = new Set(['newtheorem']);
+const LATEX_PACKAGE_IMPORTS = new Set([
+  'usepackage', 'RequirePackage', 'RequirePackageWithOptions',
+  'documentclass', 'LoadClass', 'LoadClassWithOptions', 'documentstyle',
+]);
+const LATEX_FILE_INCLUDES = new Set([
+  'input', 'include', 'subfile', 'subfileinclude', 'includeonly',
+]);
+const LATEX_BIB_INCLUDES = new Set(['bibliography', 'addbibresource']);
+const LATEX_REF_CMDS = new Set([
+  'ref', 'eqref', 'pageref', 'cref', 'Cref', 'autoref', 'nameref',
+  'vref', 'vpageref', 'labelcref', 'cpageref', 'Autoref', 'fref', 'thref',
+]);
+const LATEX_CITE_CMDS = new Set([
+  'cite', 'citep', 'citet', 'citealt', 'citealp', 'citeauthor', 'citeyear',
+  'citeyearpar', 'Citep', 'Citet', 'Cite', 'textcite', 'parencite', 'footcite',
+  'autocite', 'smartcite', 'citenum', 'fullcite', 'nocite',
+]);
+// Inside these environments backslashes are literal text, not commands; skip
+// scanning their body so listings/verbatim content never produces spurious nodes.
+const LATEX_VERBATIM_ENVS = new Set([
+  'verbatim', 'verbatim*', 'Verbatim', 'BVerbatim', 'LVerbatim',
+  'lstlisting', 'minted', 'comment', 'alltt', 'filecontents', 'filecontents*',
+]);
+
+function parseLatexFile(filePath: string, source: string): FileParseResult {
+  const language = SupportedLanguages.LaTeX;
+  const fileName = nodePath.basename(filePath);
+  const sourceLineCount = countSourceLines(source);
+  const fileRole = classifyFileRole(filePath);
+  const entities: ParsedEntity[] = [
+    { name: fileName, kind: 'file', lineStart: 1, lineEnd: sourceLineCount, language },
+  ];
+  const chunks: ParsedChunk[] = [];
+  const relationships: ParsedRelationship[] = [];
+  const lineStarts = computeLineStarts(source);
+  const n = source.length;
+
+  const lineAt = (idx: number): number => {
+    let lo = 0, hi = lineStarts.length - 1, ans = 0;
+    while (lo <= hi) {
+      const mid = (lo + hi) >> 1;
+      if (lineStarts[mid] <= idx) { ans = mid; lo = mid + 1; } else hi = mid - 1;
+    }
+    return ans + 1;
+  };
+
+  // Read a balanced { ... } group starting at `open` (source[open] === '{').
+  // Honors \{ and \} escapes. On imbalance, returns the rest of the source.
+  const readBraceGroup = (open: number): { text: string; end: number } => {
+    let depth = 0;
+    for (let q = open; q < n; q++) {
+      const cc = source[q];
+      if (cc === '\\') { q++; continue; }
+      if (cc === '{') depth++;
+      else if (cc === '}') { depth--; if (depth === 0) return { text: source.slice(open + 1, q), end: q + 1 }; }
+    }
+    return { text: source.slice(open + 1, n), end: n };
+  };
+
+  // Read an optional [ ... ] group; stops at the first unescaped ] not nested in {}.
+  const readOptGroup = (open: number): { text: string; end: number } => {
+    let braceDepth = 0;
+    for (let q = open + 1; q < n; q++) {
+      const cc = source[q];
+      if (cc === '\\') { q++; continue; }
+      if (cc === '{') braceDepth++;
+      else if (cc === '}') { if (braceDepth > 0) braceDepth--; }
+      else if (cc === ']' && braceDepth === 0) return { text: source.slice(open + 1, q), end: q + 1 };
+    }
+    return { text: source.slice(open + 1, n), end: n };
+  };
+
+  // From `start`, greedily collect immediately-following { } (required) and [ ]
+  // (optional) argument groups. Only spaces/tabs may separate them — never a
+  // newline — so an unrelated brace group on a later line is not misattributed.
+  const collectArgs = (start: number): { args: string[]; opts: string[]; end: number } => {
+    const args: string[] = [];
+    const opts: string[] = [];
+    let p = start;
+    for (;;) {
+      let q = p;
+      while (q < n && (source[q] === ' ' || source[q] === '\t')) q++;
+      if (source[q] === '{') { const g = readBraceGroup(q); args.push(g.text); p = g.end; }
+      else if (source[q] === '[') { const g = readOptGroup(q); opts.push(g.text); p = g.end; }
+      else break;
+    }
+    return { args, opts, end: p };
+  };
+
+  // Strip nested commands/braces from a sectioning/environment title for a clean name.
+  const cleanTitle = (raw: string): string =>
+    raw.replace(/\\[a-zA-Z@]+\*?/g, '').replace(/[{}$]/g, '').replace(/\s+/g, ' ').trim();
+  const csName = (raw: string): string => {
+    const m = /\\([a-zA-Z@]+)/.exec(raw);
+    return (m ? m[1] : raw.trim().replace(/^\\/, '')).trim();
+  };
+  const splitList = (raw: string): string[] =>
+    raw.split(',').map(s => s.trim()).filter(Boolean);
+  const ensureExt = (raw: string, ext: string): string =>
+    /\.[a-zA-Z0-9]+$/.test(raw) ? raw : `${raw}${ext}`;
+
+  // Sectioning hierarchy (levels 0..6); deepest active section is the live container.
+  const sectionStack: (string | null)[] = [null, null, null, null, null, null, null];
+  const sectionMarks: { name: string; level: number; line: number }[] = [];
+  const envStack: { name: string }[] = [];
+  const seenEnv = new Set<string>();
+  const currentSection = (): string | null => {
+    for (let l = 6; l >= 0; l--) if (sectionStack[l]) return sectionStack[l];
+    return null;
+  };
+  // The containment parent for definitions, labels, environments and references
+  // is the nearest enclosing SECTION (or the file). It deliberately does NOT use
+  // the open-environment name: environment type names ("figure", "table") are not
+  // unique within a file, and patch-builder resolves a CONTAINS edge's *source*
+  // by bare name with no container hint — so an environment used as a parent would
+  // be ambiguous and its children's edges would dangle. Sections (unique titles)
+  // are safe parents. Environments are still recorded as section members.
+  const container = (): string => currentSection() ?? fileName;
+
+  const addImport = (rawSpec: string, dst: string, line: number): void => {
+    if (!dst) return;
+    entities.push({ name: dst, kind: 'module', lineStart: line, lineEnd: line, language });
+    relationships.push({ srcName: fileName, dstName: dst, predicate: 'IMPORTS', importRaw: rawSpec });
+  };
+  const addDef = (name: string, kind: string, line: number): void => {
+    if (!name) return;
+    const cont = container();
+    entities.push({ name, kind, lineStart: line, lineEnd: line, language, container: cont === fileName ? undefined : cont });
+    relationships.push({ srcName: cont, dstName: name, predicate: 'CONTAINS' });
+  };
+
+  let i = 0;
+  while (i < n) {
+    const ch = source[i];
+    if (ch === '%') { while (i < n && source[i] !== '\n') i++; continue; }
+    if (ch !== '\\') { i++; continue; }
+
+    // Read the control-sequence name. A single non-letter after '\' (\\, \%, \{,
+    // \,, \$) is an escape/control symbol — consume and skip.
+    let j = i + 1;
+    if (j >= n) { i = j; continue; }
+    const c0 = source[j];
+    if (!/[a-zA-Z@]/.test(c0)) { i = j + 1; continue; }
+    let k = j;
+    while (k < n && /[a-zA-Z@]/.test(source[k])) k++;
+    const name = source.slice(j, k);
+    let after = k;
+    if (source[after] === '*') after++; // starred form (\section*, \newtheorem*)
+    const line = lineAt(i);
+
+    if (Object.prototype.hasOwnProperty.call(LATEX_SECTION_LEVELS, name)) { // own keys only: \constructor, \toString etc. must not match Object.prototype members
+      const { args, opts, end } = collectArgs(after);
+      const title = cleanTitle(args[0] ?? opts[0] ?? '');
+      if (title) {
+        const level = LATEX_SECTION_LEVELS[name];
+        let cont: string = fileName;
+        for (let l = level - 1; l >= 0; l--) if (sectionStack[l]) { cont = sectionStack[l]!; break; }
+        for (let l = level; l <= 6; l++) sectionStack[l] = null;
+        sectionStack[level] = title;
+        entities.push({ name: title, kind: 'section', lineStart: line, lineEnd: line, language, container: cont === fileName ? undefined : cont });
+        relationships.push({ srcName: cont, dstName: title, predicate: 'CONTAINS' });
+        sectionMarks.push({ name: title, level, line });
+      }
+      i = end; continue;
+    }
+
+    if (name === 'begin') {
+      const { args, end } = collectArgs(after);
+      const envName = (args[0] ?? '').trim();
+      if (envName) {
+        const cont = container();
+        const key = `${cont}\u0000${envName}`; // NUL separator keeps (container, env) pairs unambiguous
+        if (!seenEnv.has(key)) {
+          seenEnv.add(key);
+          entities.push({ name: envName, kind: 'environment', lineStart: line, lineEnd: line, language, container: cont === fileName ? undefined : cont });
+          relationships.push({ srcName: cont, dstName: envName, predicate: 'CONTAINS' });
+        }
+        envStack.push({ name: envName });
+        if (LATEX_VERBATIM_ENVS.has(envName)) {
+          // Jump to the matching \end so it pops normally. Tolerate whitespace
+          // (\end {verbatim}, \end{ verbatim }) — all legal TeX.
+          const esc = envName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+          const m = new RegExp(`\\\\end\\s*\\{\\s*${esc}\\s*\\}`).exec(source.slice(end));
+          i = m ? end + m.index : n;
+          continue;
+        }
+      }
+      i = end; continue;
+    }
+
+    if (name === 'end') {
+      const { args, end } = collectArgs(after);
+      const envName = (args[0] ?? '').trim();
+      // Pop to the nearest matching open environment; ignore stray \end gracefully.
+      for (let s = envStack.length - 1; s >= 0; s--) {
+        if (envStack[s].name === envName) { envStack.length = s; break; }
+      }
+      i = end; continue;
+    }
+
+    if (LATEX_MACRO_DEFS.has(name)) {
+      let p = after;
+      while (p < n && (source[p] === ' ' || source[p] === '\t')) p++;
+      let macro: string | null = null;
+      if (source[p] === '\\') { const m = /^\\([a-zA-Z@]+)/.exec(source.slice(p, p + 80)); if (m) { macro = m[1]; p += m[0].length; } }
+      else if (source[p] === '{') { const g = readBraceGroup(p); macro = csName(g.text); p = g.end; }
+      if (macro && !LATEX_DYNAMIC_CS.has(macro)) addDef(macro, 'function', line);
+      i = p; continue;
+    }
+
+    if (LATEX_CS_DEFS.has(name)) {
+      let p = after;
+      while (p < n && (source[p] === ' ' || source[p] === '\t')) p++;
+      if (source[p] === '\\') { const m = /^\\([a-zA-Z@]+)/.exec(source.slice(p, p + 80)); if (m) { if (!LATEX_DYNAMIC_CS.has(m[1])) addDef(m[1], 'function', line); p += m[0].length; } }
+      i = p; continue;
+    }
+
+    if (LATEX_ENV_DEFS.has(name)) {
+      const { args, end } = collectArgs(after);
+      addDef((args[0] ?? '').trim(), 'class', line);
+      i = end; continue;
+    }
+    if (LATEX_THEOREM_DEFS.has(name)) {
+      const { args, end } = collectArgs(after);
+      addDef((args[0] ?? '').trim(), 'class', line);
+      i = end; continue;
+    }
+
+    if (LATEX_PACKAGE_IMPORTS.has(name)) {
+      const { args, end } = collectArgs(after);
+      for (const pkg of splitList(args[0] ?? '')) addImport(pkg, pkg, line);
+      i = end; continue;
+    }
+
+    if (LATEX_FILE_INCLUDES.has(name)) {
+      const { args, end } = collectArgs(after);
+      for (const raw of splitList(args[0] ?? '')) addImport(raw, ensureExt(raw, '.tex'), line);
+      i = end; continue;
+    }
+    // \import{dir}{file} / \subimport{dir}{file}: path is dir + file
+    if (name === 'import' || name === 'subimport' || name === 'inputfrom' || name === 'subinputfrom') {
+      const { args, end } = collectArgs(after);
+      if (args.length >= 2 && args[1].trim()) {
+        const dir = args[0].trim().replace(/\/?$/, '/');
+        const raw = `${dir}${args[1].trim()}`;
+        addImport(raw, ensureExt(raw, '.tex'), line);
+      }
+      i = end; continue;
+    }
+    if (LATEX_BIB_INCLUDES.has(name)) {
+      const { args, end } = collectArgs(after);
+      for (const raw of splitList(args[0] ?? '')) addImport(raw, ensureExt(raw, '.bib'), line);
+      i = end; continue;
+    }
+
+    if (name === 'label') {
+      const { args, end } = collectArgs(after);
+      addDef((args[0] ?? '').trim(), 'label', line);
+      i = end; continue;
+    }
+    if (name === 'bibitem') {
+      const { args, end } = collectArgs(after);
+      addDef((args[0] ?? '').trim(), 'label', line);
+      i = end; continue;
+    }
+
+    if (LATEX_REF_CMDS.has(name) || LATEX_CITE_CMDS.has(name)) {
+      const { args, end } = collectArgs(after);
+      const src = container();
+      for (const target of splitList(args[0] ?? '')) {
+        relationships.push({ srcName: src, dstName: target, predicate: 'REFERENCES' });
+      }
+      i = end; continue;
+    }
+
+    // Unknown command: advance past the name only, so any cross-references nested
+    // in its argument groups (e.g. \caption{... \ref{x}}, \footnote{\cite{y}}) are
+    // still scanned as ordinary content on the next iterations.
+    i = after;
+  }
+
+  // Section chunks: each spans to the line before the next section at the same or
+  // shallower level, so a parent section's chunk includes its sub-sections.
+  for (let h = 0; h < sectionMarks.length; h++) {
+    const { name, level, line } = sectionMarks[h];
+    let endLine = sourceLineCount;
+    for (let m = h + 1; m < sectionMarks.length; m++) {
+      if (sectionMarks[m].level <= level) { endLine = sectionMarks[m].line - 1; break; }
+    }
+    const startByte = lineStarts[line - 1] ?? 0;
+    const endByte = endLine < lineStarts.length ? (lineStarts[endLine] ?? source.length) : source.length;
+    const content = source.slice(startByte, endByte);
+    chunks.push({
+      name,
+      chunkKind: 'section',
+      lineStart: line,
+      lineEnd: endLine,
+      startByte,
+      endByte,
+      contentHash: crypto.createHash('sha256').update(content).digest('hex'),
+      language,
+    });
+  }
+  if (sectionMarks.length === 0) {
+    chunks.push({
+      name: null,
+      chunkKind: 'file_body',
+      lineStart: 1,
+      lineEnd: Math.max(sourceLineCount, 1),
+      startByte: 0,
+      endByte: source.length,
+      contentHash: crypto.createHash('sha256').update(source).digest('hex'),
+      language,
+    });
+  }
+
+  return { filePath, language, entities, chunks, relationships, fileRole };
+}
+
 function parseDockerfileFile(filePath: string, source: string): FileParseResult {
   const language = SupportedLanguages.Dockerfile;
   const fileName = nodePath.basename(filePath);
@@ -1460,6 +1806,7 @@ export function parseFile(filePath: string, source: string): FileParseResult | n
   if (language === SupportedLanguages.JSON) return parseJsonFile(filePath, source);
   if (language === SupportedLanguages.TOML) return parseTomlFile(filePath, source);
   if (language === SupportedLanguages.Markdown) return parseMarkdownFile(filePath, source);
+  if (language === SupportedLanguages.LaTeX) return parseLatexFile(filePath, source);
 
   // TypeScript TSX uses a separate grammar
   const isTsx = filePath.endsWith('.tsx');
diff --git a/core-ingestion/src/languages.ts b/core-ingestion/src/languages.ts
index 66c8b1d..e34d138 100644
--- a/core-ingestion/src/languages.ts
+++ b/core-ingestion/src/languages.ts
@@ -31,6 +31,7 @@ export enum SupportedLanguages {
   XML = 'xml',
   HCL = 'hcl',
   CSS = 'css',
+  LaTeX = 'latex',
 }
 
 const EXT_MAP: Record<string, SupportedLanguages> = {
@@ -101,6 +102,11 @@ const EXT_MAP: Record<string, SupportedLanguages> = {
   '.scss': SupportedLanguages.CSS,
   '.sass': SupportedLanguages.CSS,
   '.less': SupportedLanguages.CSS,
+  '.tex': SupportedLanguages.LaTeX,
+  '.sty': SupportedLanguages.LaTeX,
+  '.cls': SupportedLanguages.LaTeX,
+  '.ltx': SupportedLanguages.LaTeX,
+  '.latex': SupportedLanguages.LaTeX,
 };
 
 export function languageFromPath(filePath: string): SupportedLanguages | null {
diff --git a/core-ingestion/src/queries.ts b/core-ingestion/src/queries.ts
index ef1e61f..52af40b 100644
--- a/core-ingestion/src/queries.ts
+++ b/core-ingestion/src/queries.ts
@@ -1783,5 +1783,8 @@ export const LANGUAGE_QUERIES: Record<SupportedLanguages, string> = {
   [SupportedLanguages.XML]: XML_QUERIES,
   [SupportedLanguages.HCL]: HCL_QUERIES,
   [SupportedLanguages.CSS]: CSS_QUERIES,
+  // LaTeX/TeX has no tree-sitter grammar in this stack; it is parsed by the
+  // hand-rolled scanner in parseLatexFile (index.ts), like Markdown/YAML/etc.
+  [SupportedLanguages.LaTeX]: '',
 };
 
diff --git a/ix-cli/src/cli/__tests__/ingest-discovery.test.ts b/ix-cli/src/cli/__tests__/ingest-discovery.test.ts
index 596901f..f1874ec 100644
--- a/ix-cli/src/cli/__tests__/ingest-discovery.test.ts
+++ b/ix-cli/src/cli/__tests__/ingest-discovery.test.ts
@@ -35,4 +35,23 @@ describe('dedupeDiscoveredFilePaths', () => {
     expect(isSupportedSourceFile('build/common.mk')).toBe(true);
     expect(isSupportedSourceFile('README.txt')).toBe(false);
   });
+
+  it('discovers TeX/LaTeX source files', () => {
+    expect(isSupportedSourceFile('paper/main.tex')).toBe(true);
+    expect(isSupportedSourceFile('pkg/mystyle.sty')).toBe(true);
+    expect(isSupportedSourceFile('cls/thesis.cls')).toBe(true);
+    expect(isSupportedSourceFile('legacy/doc.ltx')).toBe(true);
+    expect(isSupportedSourceFile('notes.latex')).toBe(true);
+  });
+
+  it('discovers the grammar-based parsers that ship in core-ingestion', () => {
+    // These extensions are parsed by core-ingestion but were missing from the
+    // discovery allowlist, so their files were never walked. Guard against regress.
+    for (const f of [
+      'init.lua', 'deploy.sh', 'run.bash', 'Main.hs', 'build.zig',
+      'index.html', 'pom.xml', 'app.csproj', 'main.tf', 'theme.css', 'styles.scss',
+    ]) {
+      expect(isSupportedSourceFile(f)).toBe(true);
+    }
+  });
 });
diff --git a/ix-cli/src/cli/commands/ingest.ts b/ix-cli/src/cli/commands/ingest.ts
index 10a4203..d4c2977 100644
--- a/ix-cli/src/cli/commands/ingest.ts
+++ b/ix-cli/src/cli/commands/ingest.ts
@@ -17,6 +17,7 @@ import { loadIngestionModules } from './ingestion-loader.js';
 import { ensureWorkspaceIdState } from '../bootstrap.js';
 import { detectSystem, repoWorkspaceIdFor, lookupPackage, readPackageNames, readPackageDeps } from '../system.js';
 import { CLIENT_EXPECTED_SCHEMA_VERSION } from '../backend-status.js';
+import { SUPPORTED_EXTENSIONS } from '../supported-extensions.js';
 import {
   deterministicId,
   transformIssue,
@@ -29,25 +30,6 @@ import {
 // File discovery
 // ---------------------------------------------------------------------------
 
-// Inline extension set — mirrors core-ingestion/dist/languages.js EXT_MAP.
-// Kept here so file discovery does NOT require loading tree-sitter grammars.
-const SUPPORTED_EXTENSIONS = new Set([
-  '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs',
-  '.py', '.java', '.c', '.h', '.cpp', '.cc', '.cxx', '.hpp',
-  '.cs', '.go', '.rb', '.rs', '.php', '.kt', '.kts', '.swift',
-  '.scala', '.sc',
-  '.yaml', '.yml',
-  '.dockerfile',
-  '.sql',
-  '.json',
-  '.toml',
-  '.md', '.markdown',
-  '.r',
-  '.sas',
-  '.ex', '.exs',
-  '.mk', '.makefile',
-]);
-
 export function isSupportedSourceFile(filePath: string): boolean {
   const fileName = nodePath.basename(filePath).toLowerCase();
   return fileName === 'dockerfile'
diff --git a/ix-cli/src/cli/commands/text.ts b/ix-cli/src/cli/commands/text.ts
index 8886b7e..b0e7491 100644
--- a/ix-cli/src/cli/commands/text.ts
+++ b/ix-cli/src/cli/commands/text.ts
@@ -97,6 +97,8 @@ function inferLanguage(filePath: string): string | undefined {
   if (filePath.endsWith(".md")) return "markdown";
   if (filePath.endsWith(".json")) return "json";
   if (filePath.endsWith(".yaml") || filePath.endsWith(".yml")) return "yaml";
+  if (filePath.endsWith(".tex") || filePath.endsWith(".sty") || filePath.endsWith(".cls")
+    || filePath.endsWith(".ltx") || filePath.endsWith(".latex")) return "latex";
   return undefined;
 }
 
@@ -111,6 +113,7 @@ function languageGlobs(lang: string): string[] {
     case "rust": return ["*.rs"];
     case "ruby": return ["*.rb"];
     case "markdown": return ["*.md", "*.mdx"];
+    case "latex": case "tex": return ["*.tex", "*.sty", "*.cls", "*.ltx", "*.latex"];
     case "config": return ["*.json", "*.yaml", "*.yml", "*.toml"];
     default: return [`*.${lang}`];
   }
diff --git a/ix-cli/src/cli/commands/watch.ts b/ix-cli/src/cli/commands/watch.ts
index 5300a9d..d071080 100644
--- a/ix-cli/src/cli/commands/watch.ts
+++ b/ix-cli/src/cli/commands/watch.ts
@@ -9,13 +9,7 @@ import { getEndpoint, resolveWorkspaceRoot, clearIngestMtimeCache } from "../con
 import { bootstrap, ensureWorkspaceId, ensureWorkspaceIdState } from "../bootstrap.js";
 import { loadWatchIngestionModules } from "./ingestion-loader.js";
 import { readFileContent } from "./watch-utils.js";
-const SUPPORTED_EXTENSIONS = new Set([
-  ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs",
-  ".scala", ".sc", ".java",
-  ".py", ".rb", ".go", ".rs", ".kt", ".kts", ".cs", ".php", ".swift",
-  ".c", ".h", ".cpp", ".cc", ".cxx", ".hpp",
-  ".yaml", ".yml",
-]);
+import { SUPPORTED_EXTENSIONS } from "../supported-extensions.js";
 
 const SUPPORTED_NAMES = new Set([
   ".gitignore", ".gitattributes", ".editorconfig", ".env",
diff --git a/ix-cli/src/cli/stale.ts b/ix-cli/src/cli/stale.ts
index a79a023..d4a27bb 100644
--- a/ix-cli/src/cli/stale.ts
+++ b/ix-cli/src/cli/stale.ts
@@ -1,6 +1,7 @@
 import * as fs from "node:fs";
 import * as path from "node:path";
 import { IxClient } from "../client/api.js";
+import { SUPPORTED_EXTENSIONS } from "./supported-extensions.js";
 
 export interface StaleInfo {
   lastIngestAt: string | null;
@@ -9,15 +10,6 @@ export interface StaleInfo {
   sampleChangedFiles: string[];
 }
 
-const SUPPORTED_EXTENSIONS = new Set([
-  ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs",
-  ".scala", ".sc", ".java",
-  ".py", ".rb", ".go", ".rs",
-  ".md", ".mdx",
-  ".json", ".yaml", ".yml", ".toml",
-  ".sql", ".graphql", ".gql",
-]);
-
 const SUPPORTED_NAMES = new Set([
   ".gitignore", ".gitattributes", ".editorconfig", ".env",
   ".eslintrc", ".prettierrc", ".babelrc",
diff --git a/ix-cli/src/cli/supported-extensions.ts b/ix-cli/src/cli/supported-extensions.ts
new file mode 100644
index 0000000..61142cc
--- /dev/null
+++ b/ix-cli/src/cli/supported-extensions.ts
@@ -0,0 +1,34 @@
+// Single source of truth for which file extensions ix discovers and ingests.
+//
+// This MUST stay in sync with core-ingestion's EXT_MAP (languages.ts): every
+// extension a parser handles belongs here, or those files are never walked by
+// `ix map` / `ix watch` / stale detection. It previously lived as three drifting
+// copies (ingest/watch/stale) that fell behind every new parser — hence one
+// shared set. Dockerfile/Makefile and other extensionless files are matched by
+// name in the individual commands, not here.
+export const SUPPORTED_EXTENSIONS = new Set([
+  ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs",
+  ".py", ".java", ".c", ".h", ".cpp", ".cc", ".cxx", ".hpp",
+  ".cs", ".go", ".rb", ".rs", ".php", ".kt", ".kts", ".swift",
+  ".scala", ".sc",
+  ".yaml", ".yml",
+  ".dockerfile",
+  ".sql",
+  ".json",
+  ".toml",
+  ".md", ".markdown",
+  ".r",
+  ".sas",
+  ".ex", ".exs",
+  ".mk", ".makefile",
+  ".lua",
+  ".sh", ".bash", ".zsh", ".ksh",
+  ".hs", ".lhs",
+  ".zig",
+  ".html", ".htm", ".xhtml",
+  ".xml", ".xsd", ".xsl", ".xslt", ".wsdl",
+  ".csproj", ".vbproj", ".fsproj", ".props", ".targets", ".plist",
+  ".tf", ".tfvars", ".hcl",
+  ".css", ".scss", ".sass", ".less",
+  ".tex", ".sty", ".cls", ".ltx", ".latex",
+]);