From 80ce5a627417f77f6564fec279374c82556a6aca Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 26 Apr 2026 18:55:37 +0800 Subject: [PATCH 1/2] feat: add parser registry and custom format support Add extensible parser system with trait-based architecture that allows custom format parsers to be registered alongside built-in Markdown and PDF support. BREAKING CHANGE: SourceFormat and DocumentFormat gain a Custom(String) variant and no longer derive Copy, so existing code that matches on them exhaustively or copies them by value must be updated. - Introduce Parser trait and ParserRegistry for custom format support - Add SourceFormat::Custom variant for plugin-resolved formats - Support pre-parsed raw nodes input to skip parsing stage - Provide builder methods for PipelineExecutor with custom registries - Add Python bindings for raw node compilation - Update documentation with examples for adding custom parsers --- crates/vectorless-compiler/src/config.rs | 4 +- crates/vectorless-compiler/src/lib.rs | 3 + .../src/parse/markdown/mod.rs | 38 +++++ crates/vectorless-compiler/src/parse/mod.rs | 153 +++++++++++++++++- .../vectorless-compiler/src/parse/pdf/mod.rs | 42 +++++ .../src/passes/frontend/parse.rs | 118 ++++++++++---- .../src/pipeline/context.rs | 46 +++++- .../src/pipeline/executor.rs | 32 ++++ .../src/pipeline/orchestrator.rs | 2 +- crates/vectorless-document/src/format.rs | 28 +++- crates/vectorless-document/src/lib.rs | 1 + .../vectorless-document/src/understanding.rs | 24 +++ crates/vectorless-engine/src/engine.rs | 76 ++++++++- crates/vectorless-engine/src/indexer.rs | 13 +- crates/vectorless-engine/src/lib.rs | 2 +- crates/vectorless-py/src/engine.rs | 40 ++++- vectorless/engine.py | 30 +++- 17 files changed, 594 insertions(+), 58 deletions(-) diff --git a/crates/vectorless-compiler/src/config.rs b/crates/vectorless-compiler/src/config.rs index b129ddc..4983241 100644 --- a/crates/vectorless-compiler/src/config.rs +++ b/crates/vectorless-compiler/src/config.rs @@ -18,7 +18,7 @@ use vectorless_utils::fingerprint::{Fingerprint, 
Fingerprinter}; use std::path::PathBuf; /// Index mode for document processing. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub enum SourceFormat { /// Auto-detect format from file extension. Auto, @@ -26,6 +26,8 @@ pub enum SourceFormat { Markdown, /// Force PDF format. Pdf, + /// Custom format resolved via [`ParserRegistry`](crate::parse::ParserRegistry). + Custom(String), } impl Default for SourceFormat { diff --git a/crates/vectorless-compiler/src/lib.rs b/crates/vectorless-compiler/src/lib.rs index 29be8f6..d60697b 100644 --- a/crates/vectorless-compiler/src/lib.rs +++ b/crates/vectorless-compiler/src/lib.rs @@ -67,6 +67,9 @@ pub mod summary; // Re-export main types from pipeline pub use pipeline::{CompileMetrics, CompileResult, CompilerInput, PipelineExecutor}; +// Re-export parser plugin types +pub use parse::{Parser, ParserRegistry}; + // Re-export config types pub use config::{PipelineOptions, SourceFormat, ThinningConfig}; pub use vectorless_document::ReasoningIndexConfig; diff --git a/crates/vectorless-compiler/src/parse/markdown/mod.rs b/crates/vectorless-compiler/src/parse/markdown/mod.rs index e384f52..b49aefa 100644 --- a/crates/vectorless-compiler/src/parse/markdown/mod.rs +++ b/crates/vectorless-compiler/src/parse/markdown/mod.rs @@ -26,3 +26,41 @@ mod frontmatter; mod parser; pub use parser::MarkdownParser; + +use crate::parse::{Parser, ParseResult}; +use std::path::Path; +use vectorless_error::Result; + +/// [`Parser`] trait adapter for [`MarkdownParser`]. +pub struct MarkdownParserAdapter { + inner: MarkdownParser, +} + +impl MarkdownParserAdapter { + /// Create a new Markdown parser adapter. 
+ pub fn new() -> Self { + Self { inner: MarkdownParser::new() } + } +} + +#[async_trait::async_trait] +impl Parser for MarkdownParserAdapter { + fn name(&self) -> &str { "markdown" } + + fn extensions(&self) -> &[&str] { &["md", "markdown"] } + + async fn parse_content(&self, content: &str) -> Result { + self.inner.parse(content).await + } + + async fn parse_file(&self, path: &Path) -> Result { + self.inner.parse_file(path).await + } + + async fn parse_bytes(&self, data: &[u8]) -> Result { + let content = std::str::from_utf8(data).map_err(|e| { + vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e)) + })?; + self.inner.parse(content).await + } +} diff --git a/crates/vectorless-compiler/src/parse/mod.rs b/crates/vectorless-compiler/src/parse/mod.rs index 593f654..1cfc687 100644 --- a/crates/vectorless-compiler/src/parse/mod.rs +++ b/crates/vectorless-compiler/src/parse/mod.rs @@ -3,8 +3,26 @@ //! Document parsing for the compile pipeline. //! -//! Supports Markdown and PDF formats. Parsing is dispatched directly -//! via `match` — no trait objects or registry needed. +//! Supports Markdown and PDF formats out of the box. Custom parsers can be +//! added via the [`Parser`] trait and [`ParserRegistry`]. +//! +//! # Adding a custom parser +//! +//! ```rust,ignore +//! use vectorless_compiler::parse::{Parser, ParseResult, ParserRegistry}; +//! +//! struct MyParser; +//! +//! #[async_trait] +//! impl Parser for MyParser { +//! fn name(&self) -> &str { "my-format" } +//! fn extensions(&self) -> &[&str] { &["foo", "bar"] } +//! async fn parse_content(&self, content: &str) -> Result { ... } +//! async fn parse_file(&self, path: &Path) -> Result { ... } +//! } +//! +//! let registry = ParserRegistry::default_parsers(None).with(MyParser); +//! 
``` pub mod markdown; pub mod pdf; @@ -14,12 +32,134 @@ pub mod types; // Re-export core types at module level pub use types::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; +use std::collections::HashMap; use std::path::Path; use crate::parse::markdown::MarkdownParser; use vectorless_error::Result; use vectorless_llm::LlmClient; +// --------------------------------------------------------------------------- +// Parser trait +// --------------------------------------------------------------------------- + +/// Trait for document format parsers. +/// +/// Implement this to add support for a new document format. +/// Register via [`ParserRegistry::register`] or [`ParserRegistry::with`]. +#[async_trait::async_trait] +pub trait Parser: Send + Sync { + /// Parser name (e.g., "markdown", "pdf", "code"). + fn name(&self) -> &str; + + /// File extensions this parser handles, without dot (e.g., `["py", "rs"]`). + fn extensions(&self) -> &[&str] { + &[] + } + + /// Parse string content into raw nodes. + async fn parse_content(&self, content: &str) -> Result; + + /// Parse a file into raw nodes. + async fn parse_file(&self, path: &Path) -> Result; + + /// Parse binary data into raw nodes. + async fn parse_bytes(&self, data: &[u8]) -> Result { + let _ = data; + Err(vectorless_error::Error::Parse( + "Binary parsing not supported by this parser".into(), + )) + } +} + +// --------------------------------------------------------------------------- +// ParserRegistry +// --------------------------------------------------------------------------- + +/// Registry of document format parsers. +/// +/// Maps parser names and file extensions to [`Parser`] implementations. +/// Built-in parsers for Markdown and PDF are provided by [`ParserRegistry::default_parsers`]. +pub struct ParserRegistry { + parsers: HashMap>, + extension_map: HashMap, +} + +impl ParserRegistry { + /// Create an empty registry. 
+ pub fn new() -> Self { + Self { + parsers: HashMap::new(), + extension_map: HashMap::new(), + } + } + + /// Register a parser. Extensions declared by the parser are auto-indexed. + pub fn register(&mut self, parser: impl Parser + 'static) { + let name = parser.name().to_string(); + for ext in parser.extensions() { + self.extension_map.insert(ext.to_lowercase(), name.clone()); + } + self.parsers.insert(name, Box::new(parser)); + } + + /// Builder-style registration. + pub fn with(mut self, parser: impl Parser + 'static) -> Self { + self.register(parser); + self + } + + /// Get a parser by name. + pub fn get(&self, name: &str) -> Option<&dyn Parser> { + self.parsers.get(name).map(|p| p.as_ref()) + } + + /// Get a parser by file extension (lowercase). + pub fn get_by_extension(&self, ext: &str) -> Option<&dyn Parser> { + self.extension_map + .get(&ext.to_lowercase()) + .and_then(|name| self.parsers.get(name)) + .map(|p| p.as_ref()) + } + + /// Default registry with built-in Markdown + PDF parsers. + pub fn default_parsers(llm_client: Option) -> Self { + let mut registry = Self::new(); + registry.register(markdown::MarkdownParserAdapter::new()); + registry.register(pdf::PdfParserAdapter::new(llm_client)); + registry + } + + /// List all registered parser names. + pub fn parser_names(&self) -> Vec<&str> { + self.parsers.keys().map(|s| s.as_str()).collect() + } + + /// List all supported file extensions (lowercase, no dot). 
+ pub fn supported_extensions(&self) -> Vec<&str> { + self.extension_map.keys().map(|s| s.as_str()).collect() + } +} + +impl Default for ParserRegistry { + fn default() -> Self { + Self::default_parsers(None) + } +} + +impl std::fmt::Debug for ParserRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ParserRegistry") + .field("parsers", &self.parsers.keys().collect::>()) + .field("extensions", &self.extension_map) + .finish() + } +} + +// --------------------------------------------------------------------------- +// Legacy free functions (backward compat — delegate to default registry) +// --------------------------------------------------------------------------- + /// Parse a string content document. pub async fn parse_content( content: &str, @@ -34,6 +174,9 @@ pub async fn parse_content( DocumentFormat::Pdf => Err(vectorless_error::Error::Parse( "PDF requires bytes, not string content".to_string(), )), + _ => Err(vectorless_error::Error::Parse( + format!("Unsupported format for content parsing: {:?}", format), + )), } } @@ -55,6 +198,9 @@ pub async fn parse_file( }; parser.parse_file(path).await } + _ => Err(vectorless_error::Error::Parse( + format!("Unsupported format for file parsing: {:?}", format), + )), } } @@ -79,6 +225,9 @@ pub async fn parse_bytes( }; parser.parse_bytes_async(bytes, None).await } + _ => Err(vectorless_error::Error::Parse( + format!("Unsupported format for bytes parsing: {:?}", format), + )), } } diff --git a/crates/vectorless-compiler/src/parse/pdf/mod.rs b/crates/vectorless-compiler/src/parse/pdf/mod.rs index 3226e44..6a25387 100644 --- a/crates/vectorless-compiler/src/parse/pdf/mod.rs +++ b/crates/vectorless-compiler/src/parse/pdf/mod.rs @@ -27,3 +27,45 @@ mod types; pub use parser::PdfParser; pub use types::PdfPage; + +use crate::parse::{Parser, ParseResult}; +use std::path::Path; +use vectorless_error::Result; +use vectorless_llm::LlmClient; + +/// [`Parser`] trait adapter for 
[`PdfParser`]. +pub struct PdfParserAdapter { + inner: PdfParser, +} + +impl PdfParserAdapter { + /// Create a PDF parser adapter, optionally with LLM support. + pub fn new(llm_client: Option) -> Self { + let inner = match llm_client { + Some(client) => PdfParser::with_llm_client(client), + None => PdfParser::new(), + }; + Self { inner } + } +} + +#[async_trait::async_trait] +impl Parser for PdfParserAdapter { + fn name(&self) -> &str { "pdf" } + + fn extensions(&self) -> &[&str] { &["pdf"] } + + async fn parse_content(&self, _content: &str) -> Result { + Err(vectorless_error::Error::Parse( + "PDF requires bytes, not string content".into(), + )) + } + + async fn parse_file(&self, path: &Path) -> Result { + self.inner.parse_file(path).await + } + + async fn parse_bytes(&self, data: &[u8]) -> Result { + self.inner.parse_bytes_async(data, None).await + } +} diff --git a/crates/vectorless-compiler/src/passes/frontend/parse.rs b/crates/vectorless-compiler/src/passes/frontend/parse.rs index d757bca..a3f224e 100644 --- a/crates/vectorless-compiler/src/passes/frontend/parse.rs +++ b/crates/vectorless-compiler/src/passes/frontend/parse.rs @@ -11,6 +11,7 @@ use vectorless_document::DocumentFormat; use vectorless_error::Result; use crate::SourceFormat; +use crate::parse::ParserRegistry; use crate::passes::{CompilePass, PassResult}; use crate::pipeline::{CompileContext, CompilerInput}; @@ -18,24 +19,38 @@ use crate::pipeline::{CompileContext, CompilerInput}; pub struct ParsePass { /// Optional LLM client for PDF structure extraction. llm_client: Option, + /// Parser registry for format dispatch. + registry: ParserRegistry, } impl ParsePass { - /// Create a new parse stage. + /// Create a new parse stage with default parsers. pub fn new() -> Self { - Self { llm_client: None } + Self { + llm_client: None, + registry: ParserRegistry::default_parsers(None), + } } /// Create a parse stage with an LLM client. 
pub fn with_llm_client(client: vectorless_llm::LlmClient) -> Self { Self { - llm_client: Some(client), + llm_client: Some(client.clone()), + registry: ParserRegistry::default_parsers(Some(client)), + } + } + + /// Create a parse stage with a custom parser registry. + pub fn with_registry(registry: ParserRegistry) -> Self { + Self { + llm_client: None, + registry, } } /// Detect document format from path and options. fn detect_format(&self, ctx: &CompileContext) -> Result { - match ctx.options.mode { + match &ctx.options.mode { SourceFormat::Auto => match &ctx.input { CompilerInput::File(path) => { let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); @@ -43,11 +58,22 @@ impl ParsePass { vectorless_error::Error::Parse(format!("Unknown format: {}", ext)) }) } - CompilerInput::Content { format, .. } => Ok(*format), - CompilerInput::Bytes { format, .. } => Ok(*format), + CompilerInput::Content { format, .. } => Ok(format.clone()), + CompilerInput::Bytes { format, .. } => Ok(format.clone()), + CompilerInput::PreParsed { .. } => Ok(DocumentFormat::Markdown), }, SourceFormat::Markdown => Ok(DocumentFormat::Markdown), SourceFormat::Pdf => Ok(DocumentFormat::Pdf), + SourceFormat::Custom(name) => Ok(DocumentFormat::Custom(name.clone())), + } + } + + /// Resolve format name for registry lookup. 
+ fn format_name(format: &DocumentFormat) -> &str { + match format { + DocumentFormat::Markdown => "markdown", + DocumentFormat::Pdf => "pdf", + DocumentFormat::Custom(name) => name, } } } @@ -67,63 +93,88 @@ impl CompilePass for ParsePass { async fn execute(&mut self, ctx: &mut CompileContext) -> Result { let start = Instant::now(); + // Handle pre-parsed input: skip parsing entirely + if let CompilerInput::PreParsed { nodes, name } = &ctx.input { + let nodes = nodes.clone(); + let name = name.clone(); + ctx.raw_nodes = nodes; + ctx.name = name; + ctx.format = DocumentFormat::Custom("pre-parsed".to_string()); + ctx.metrics.set_nodes_processed(ctx.raw_nodes.len()); + + let duration = start.elapsed().as_millis() as u64; + info!( + "[parse] Pre-parsed: {} nodes for '{}' ({}ms)", + ctx.raw_nodes.len(), + ctx.name, + duration + ); + + let mut stage_result = PassResult::success("parse"); + stage_result.duration_ms = duration; + stage_result.metadata.insert( + "node_count".to_string(), + serde_json::json!(ctx.raw_nodes.len()), + ); + stage_result.metadata.insert( + "source".to_string(), + serde_json::json!("pre-parsed"), + ); + return Ok(stage_result); + } + // Detect format let format = self.detect_format(ctx)?; + let format_name = Self::format_name(&format).to_string(); ctx.format = format; let input_type = match &ctx.input { CompilerInput::File(_) => "file", CompilerInput::Content { .. } => "content", CompilerInput::Bytes { .. } => "bytes", + CompilerInput::PreParsed { .. } => unreachable!(), }; + info!( - "[parse] Starting: format={:?}, input={}, llm={}", - format, + "[parse] Starting: format={}, input={}, llm={}", + format_name, input_type, self.llm_client.is_some() ); + // Look up parser in registry + let parser = self.registry.get(&format_name).ok_or_else(|| { + vectorless_error::Error::Parse(format!( + "No parser registered for format '{}'. 
Available: {:?}", + format_name, + self.registry.parser_names() + )) + })?; + // Parse based on input type let result = match &ctx.input { CompilerInput::File(path) => { - // Resolve path let path = path.canonicalize().unwrap_or_else(|_| path.clone()); ctx.source_path = Some(path.clone()); - - // Extract name from file ctx.name = path .file_stem() .and_then(|n| n.to_str()) .unwrap_or("document") .to_string(); - debug!("[parse] Reading file: {:?}", ctx.source_path); - - // Parse directly - crate::parse::parse_file(&path, format, self.llm_client.clone()).await? + parser.parse_file(&path).await? } - CompilerInput::Content { - content, - name, - format, - } => { - // Set name + CompilerInput::Content { content, name, .. } => { ctx.name = name.clone(); - debug!("[parse] Parsing inline content ({} chars)", content.len()); - - // Parse content directly - crate::parse::parse_content(content, *format, self.llm_client.clone()).await? + parser.parse_content(content).await? } - CompilerInput::Bytes { data, name, format } => { - // Set name + CompilerInput::Bytes { data, name, .. } => { ctx.name = name.clone(); - debug!("[parse] Parsing bytes ({} bytes)", data.len()); - - // Parse bytes - crate::parse::parse_bytes(data, *format, self.llm_client.clone()).await? + parser.parse_bytes(data).await? } + CompilerInput::PreParsed { .. 
} => unreachable!(), }; // Store results @@ -145,9 +196,10 @@ impl CompilePass for ParsePass { ctx.metrics.record_parse(duration); info!( - "[parse] Complete: {} nodes from '{}' ({}ms)", + "[parse] Complete: {} nodes from '{}' ({}, {}ms)", ctx.raw_nodes.len(), ctx.name, + format_name, duration ); @@ -159,7 +211,7 @@ impl CompilePass for ParsePass { ); stage_result .metadata - .insert("format".to_string(), serde_json::json!(format.extension())); + .insert("format".to_string(), serde_json::json!(&format_name)); Ok(stage_result) } diff --git a/crates/vectorless-compiler/src/pipeline/context.rs b/crates/vectorless-compiler/src/pipeline/context.rs index 7aab10e..6ef1c32 100644 --- a/crates/vectorless-compiler/src/pipeline/context.rs +++ b/crates/vectorless-compiler/src/pipeline/context.rs @@ -41,6 +41,17 @@ pub enum CompilerInput { /// Document format. format: DocumentFormat, }, + + /// Pre-parsed raw nodes — skip ParsePass entirely. + /// + /// Use this when the caller (e.g., a Python plugin) has already parsed + /// the document into structured nodes. The pipeline starts from BuildPass. + PreParsed { + /// Pre-parsed raw nodes. + nodes: Vec, + /// Document name. + name: String, + }, } impl CompilerInput { @@ -93,6 +104,19 @@ impl CompilerInput { } } + /// Create input from pre-parsed raw nodes. + /// + /// Skips ParsePass — the pipeline starts from BuildPass. + pub fn pre_parsed( + nodes: Vec, + name: impl Into, + ) -> Self { + Self::PreParsed { + nodes, + name: name.into(), + } + } + /// Check if this is a file input. pub fn is_file(&self) -> bool { matches!(self, Self::File(_)) @@ -108,12 +132,18 @@ impl CompilerInput { matches!(self, Self::Bytes { .. }) } + /// Check if this is a pre-parsed input. + pub fn is_pre_parsed(&self) -> bool { + matches!(self, Self::PreParsed { .. }) + } + /// Get the format if available. pub fn format(&self) -> Option { match self { Self::File(_) => None, - Self::Content { format, .. } => Some(*format), - Self::Bytes { format, .. 
} => Some(*format), + Self::Content { format, .. } => Some(format.clone()), + Self::Bytes { format, .. } => Some(format.clone()), + Self::PreParsed { .. } => None, } } } @@ -327,13 +357,19 @@ impl CompileContext { use sha2::{Digest, Sha256}; let hash = match input { CompilerInput::File(path) => { - // Hash the file path as proxy — actual content may not be readable yet - // (the parse stage reads it). This is sufficient for checkpoint invalidation - // since a different file path implies different content. Sha256::digest(path.to_string_lossy().as_bytes()) } CompilerInput::Content { content, .. } => Sha256::digest(content.as_bytes()), CompilerInput::Bytes { data, .. } => Sha256::digest(data), + CompilerInput::PreParsed { nodes, .. } => { + // Hash a summary of the nodes: count + first titles + let mut hasher = Sha256::new(); + hasher.update(nodes.len().to_le_bytes()); + for node in nodes.iter().take(10) { + hasher.update(node.title.as_bytes()); + } + hasher.finalize() + } }; format!("{:x}", hash) } diff --git a/crates/vectorless-compiler/src/pipeline/executor.rs b/crates/vectorless-compiler/src/pipeline/executor.rs index d73c3bc..41f870f 100644 --- a/crates/vectorless-compiler/src/pipeline/executor.rs +++ b/crates/vectorless-compiler/src/pipeline/executor.rs @@ -12,6 +12,7 @@ use vectorless_error::Result; use vectorless_llm::LlmClient; use super::super::PipelineOptions; +use super::super::parse::{Parser, ParserRegistry}; use super::super::passes::{ BuildPass, ChainPass, CompilePass, ConceptPass, EnhancePass, EnrichPass, NavigationPass, OptimizePass, OverlapPass, ParsePass, ReasoningPass, RoutePass, ScorePass, SplitPass, @@ -138,6 +139,37 @@ impl PipelineExecutor { Self { orchestrator } } + /// Create with a custom parser registry. + /// + /// Use this to register custom format parsers alongside the built-in + /// Markdown and PDF parsers. 
+ pub fn with_registry(registry: ParserRegistry) -> Self { + let orchestrator = PipelineOrchestrator::new() + .stage_with_priority(ParsePass::with_registry(registry), 10) + .stage_with_priority(BuildPass::new(), 20) + .stage_with_priority(ValidatePass::new(), 22) + .stage_with_priority(SplitPass::new(), 25) + .stage_with_priority(EnrichPass::new(), 40) + .stage_with_priority(ReasoningPass::new(), 45) + .stage_with_priority(ConceptPass::new(), 47) + .stage_with_priority(NavigationPass::new(), 50) + .stage_with_priority(RoutePass::new(), 52) + .stage_with_priority(ChainPass::new(), 54) + .stage_with_priority(OverlapPass::new(), 56) + .stage_with_priority(ScorePass::new(), 58) + .stage_with_priority(VerifyPass, 55) + .stage_with_priority(OptimizePass::new(), 60); + Self { orchestrator } + } + + /// Add a single custom parser. + /// + /// Creates a default registry with built-in parsers plus the provided one. + pub fn with_parser(parser: impl Parser + 'static) -> Self { + let registry = ParserRegistry::default_parsers(None).with(parser); + Self::with_registry(registry) + } + /// Add a stage with default priority. /// /// The stage will be added after existing stages with the same priority. 
diff --git a/crates/vectorless-compiler/src/pipeline/orchestrator.rs b/crates/vectorless-compiler/src/pipeline/orchestrator.rs index 26c0860..d1741b6 100644 --- a/crates/vectorless-compiler/src/pipeline/orchestrator.rs +++ b/crates/vectorless-compiler/src/pipeline/orchestrator.rs @@ -576,7 +576,7 @@ impl PipelineOrchestrator { clone.existing_tree = ctx.existing_tree.clone(); clone.doc_id = ctx.doc_id.clone(); clone.name = ctx.name.clone(); - clone.format = ctx.format; + clone.format = ctx.format.clone(); clone.source_path = ctx.source_path.clone(); if let Some(ref llm) = ctx.llm_client { clone.llm_client = Some(llm.clone()); diff --git a/crates/vectorless-document/src/format.rs b/crates/vectorless-document/src/format.rs index 8901dc4..a308fdd 100644 --- a/crates/vectorless-document/src/format.rs +++ b/crates/vectorless-document/src/format.rs @@ -6,12 +6,35 @@ use serde::{Deserialize, Serialize}; /// Supported document formats. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum DocumentFormat { /// Markdown files (.md, .markdown) Markdown, /// PDF files (.pdf) Pdf, + /// Custom format identified by name (for parser plugins). + Custom(String), +} + +impl Serialize for DocumentFormat { + fn serialize(&self, serializer: S) -> Result { + match self { + Self::Markdown => serializer.serialize_str("markdown"), + Self::Pdf => serializer.serialize_str("pdf"), + Self::Custom(name) => serializer.serialize_str(name), + } + } +} + +impl<'de> Deserialize<'de> for DocumentFormat { + fn deserialize>(deserializer: D) -> Result { + let s = String::deserialize(deserializer)?; + match s.as_str() { + "markdown" => Ok(Self::Markdown), + "pdf" => Ok(Self::Pdf), + _ => Ok(Self::Custom(s)), + } + } } impl DocumentFormat { @@ -25,10 +48,11 @@ impl DocumentFormat { } /// Get the file extension for this format. 
- pub fn extension(&self) -> &'static str { + pub fn extension(&self) -> &str { match self { Self::Markdown => "md", Self::Pdf => "pdf", + Self::Custom(name) => name, } } diff --git a/crates/vectorless-document/src/lib.rs b/crates/vectorless-document/src/lib.rs index 08d7cd9..b32a05b 100644 --- a/crates/vectorless-document/src/lib.rs +++ b/crates/vectorless-document/src/lib.rs @@ -46,6 +46,7 @@ pub use toc::{TocConfig, TocEntry, TocNode, TocView}; pub use tree::{DocumentTree, RetrievalIndex}; pub use understanding::{ CURRENT_SCHEMA_VERSION, Concept, Document, DocumentInfo, DocumentMeta, IngestInput, + RawNodeInput, }; // Re-export agent acceleration types diff --git a/crates/vectorless-document/src/understanding.rs b/crates/vectorless-document/src/understanding.rs index 91a40c6..47951e6 100644 --- a/crates/vectorless-document/src/understanding.rs +++ b/crates/vectorless-document/src/understanding.rs @@ -353,6 +353,30 @@ pub enum IngestInput { /// Document content. content: String, }, + /// Compile from pre-parsed raw nodes. + /// + /// Skips the parse stage — the pipeline starts from tree building. + /// Use this when the caller has already structured the document. + PreParsed { + /// Document name. + name: String, + /// Pre-parsed raw nodes. + nodes: Vec, + }, +} + +/// A raw node for [`IngestInput::PreParsed`]. +/// +/// Simplified version of `RawNode` for external API — callers construct +/// these from Python or other languages. +#[derive(Debug, Clone)] +pub struct RawNodeInput { + /// Node title (e.g., section heading or file path). + pub title: String, + /// Node content. + pub content: String, + /// Hierarchy level (0 = root, 1 = top-level, etc.). 
+ pub level: usize, } #[cfg(test)] diff --git a/crates/vectorless-engine/src/engine.rs b/crates/vectorless-engine/src/engine.rs index 84b9184..16a9901 100644 --- a/crates/vectorless-engine/src/engine.rs +++ b/crates/vectorless-engine/src/engine.rs @@ -374,15 +374,21 @@ impl Engine { /// The engine builds a full understanding including tree, navigation index, /// reasoning index, summary, and key concepts. pub async fn compile(&self, input: IngestInput) -> Result { + // Handle PreParsed input directly — bypass CompileSource routing + if let IngestInput::PreParsed { nodes, name } = &input { + return self.compile_pre_parsed(nodes, name).await; + } + let ctx = match &input { IngestInput::Path(path) => CompileInput::from_path(path), IngestInput::Bytes { data, format, .. } => { - CompileInput::from_bytes(data.clone(), *format) + CompileInput::from_bytes(data.clone(), format.clone()) } IngestInput::Text { content, .. } => CompileInput::from_content( content, vectorless_compiler::parse::DocumentFormat::Markdown, ), + IngestInput::PreParsed { .. } => unreachable!(), }; let result = self.compile_pipeline(ctx).await?; @@ -402,6 +408,67 @@ impl Engine { Ok(doc.info()) } + /// Compile from pre-parsed raw nodes — skips the parse stage. 
+ async fn compile_pre_parsed( + &self, + nodes: &[vectorless_document::RawNodeInput], + name: &str, + ) -> Result { + use vectorless_compiler::parse::RawNode; + use vectorless_document::{CURRENT_SCHEMA_VERSION, DocumentMeta}; + + let raw_nodes: Vec = nodes + .iter() + .map(|n| RawNode::new(&n.title).with_content(&n.content).with_level(n.level)) + .collect(); + + let compiler_input = + vectorless_compiler::CompilerInput::pre_parsed(raw_nodes, name.to_string()); + let pipeline_options = vectorless_compiler::PipelineOptions::default(); + + let mut executor = (self.indexer.executor_factory)(); + let result = executor.execute(compiler_input, pipeline_options).await?; + + let tree = result + .tree + .ok_or_else(|| Error::Parse("Document tree not generated".to_string()))?; + + let node_count = tree.node_count(); + let doc_id = uuid::Uuid::new_v4().to_string(); + + let mut meta = DocumentMeta::new(); + meta.update_processing_stats(node_count, result.metrics.total_tokens_generated, result.metrics.total_time_ms()); + + let doc = vectorless_document::Document { + schema_version: CURRENT_SCHEMA_VERSION, + doc_id: doc_id.clone(), + name: name.to_string(), + format: "pre-parsed".to_string(), + source_path: None, + tree, + nav_index: result.navigation_index.unwrap_or_default(), + reasoning_index: result.reasoning_index.unwrap_or_default(), + summary: result.description.unwrap_or_default(), + concepts: result.concepts, + query_routes: result.query_routes, + chain_index: result.chain_index, + content_overlap: result.content_overlap, + evidence_scores: result.evidence_scores, + page_count: result.page_count, + meta: Some(meta), + }; + + self.workspace.save(&doc).await?; + + let loaded = self + .workspace + .load(&doc_id) + .await? + .ok_or_else(|| Error::Config("Document not found after compile".into()))?; + + Ok(loaded.info()) + } + /// Remove a document from the workspace. 
pub async fn forget(&self, doc_id: &str) -> Result<()> { self.workspace.remove(doc_id).await?; @@ -509,8 +576,8 @@ impl Engine { .indexer .detect_format_from_path(path) .unwrap_or(vectorless_compiler::parse::DocumentFormat::Markdown), - CompileSource::Content { format, .. } => *format, - CompileSource::Bytes { format, .. } => *format, + CompileSource::Content { format, .. } => format.clone(), + CompileSource::Bytes { format, .. } => format.clone(), }; let checkpoint_dir = Some(self.config.storage.checkpoint_dir.clone()); @@ -519,6 +586,9 @@ impl Engine { mode: match format { vectorless_compiler::parse::DocumentFormat::Markdown => SourceFormat::Markdown, vectorless_compiler::parse::DocumentFormat::Pdf => SourceFormat::Pdf, + vectorless_compiler::parse::DocumentFormat::Custom(ref name) => { + SourceFormat::Custom(name.clone()) + } }, generate_ids: options.generate_ids, summary_strategy: if options.generate_summaries { diff --git a/crates/vectorless-engine/src/indexer.rs b/crates/vectorless-engine/src/indexer.rs index fd31574..5854fdd 100644 --- a/crates/vectorless-engine/src/indexer.rs +++ b/crates/vectorless-engine/src/indexer.rs @@ -42,7 +42,7 @@ use vectorless_events::{CompileEvent, EventEmitter}; /// true parallel document compilation without mutex contention. pub(crate) struct IndexerClient { /// Factory for creating pipeline executors (one per compile operation). - executor_factory: Arc PipelineExecutor + Send + Sync>, + pub(crate) executor_factory: Arc PipelineExecutor + Send + Sync>, /// Event emitter. 
events: EventEmitter, @@ -92,11 +92,11 @@ impl IndexerClient { match source { CompileSource::Path(path) => self.index_from_path(path, name, pipeline_options).await, CompileSource::Content { data, format } => { - self.index_from_content(data, *format, name, pipeline_options) + self.index_from_content(data, format.clone(), name, pipeline_options) .await } CompileSource::Bytes { data, format } => { - self.index_from_bytes(data, *format, name, pipeline_options) + self.index_from_bytes(data, format.clone(), name, pipeline_options) .await } } @@ -152,7 +152,7 @@ impl IndexerClient { pipeline_options: PipelineOptions, ) -> Result { // Validate content before compiling - let validation = vectorless_utils::validate_content(content, format); + let validation = vectorless_utils::validate_content(content, format.clone()); if !validation.valid { return Err(Error::Parse( validation @@ -184,7 +184,7 @@ impl IndexerClient { pipeline_options: PipelineOptions, ) -> Result { // Validate bytes before compiling - let validation = vectorless_utils::validate_bytes(bytes, format); + let validation = vectorless_utils::validate_bytes(bytes, format.clone()); if !validation.valid { return Err(Error::Parse( validation @@ -230,7 +230,7 @@ impl IndexerClient { let doc_id = Uuid::new_v4().to_string(); self.events - .emit_compile(CompileEvent::FormatDetected { format }); + .emit_compile(CompileEvent::FormatDetected { format: format.clone() }); info!("Compiling {:?} document: {}", format, source_label); @@ -325,6 +325,7 @@ impl IndexerClient { SourceFormat::Markdown => DocumentFormat::Markdown, SourceFormat::Pdf => DocumentFormat::Pdf, SourceFormat::Auto => DocumentFormat::Markdown, + SourceFormat::Custom(name) => DocumentFormat::Custom(name.clone()), } } diff --git a/crates/vectorless-engine/src/lib.rs b/crates/vectorless-engine/src/lib.rs index c0af41e..a89d582 100644 --- a/crates/vectorless-engine/src/lib.rs +++ b/crates/vectorless-engine/src/lib.rs @@ -48,7 +48,7 @@ pub use 
vectorless_document::DocumentFormat; pub use vectorless_config::Config; pub use vectorless_document::DocumentTree; -pub use vectorless_document::{Concept, DocumentInfo, IngestInput}; +pub use vectorless_document::{Concept, DocumentInfo, IngestInput, RawNodeInput}; pub use vectorless_error::{Error, Result}; pub use vectorless_events::{CompileEvent, EventEmitter, WorkspaceEvent}; pub use vectorless_graph::{ diff --git a/crates/vectorless-py/src/engine.rs b/crates/vectorless-py/src/engine.rs index 31f64e9..a3471e5 100644 --- a/crates/vectorless-py/src/engine.rs +++ b/crates/vectorless-py/src/engine.rs @@ -8,7 +8,7 @@ use pyo3_async_runtimes::tokio::future_into_py; use std::sync::Arc; use tokio::runtime::Runtime; -use ::vectorless_engine::{Engine, EngineBuilder, IngestInput}; +use ::vectorless_engine::{Engine, EngineBuilder, IngestInput, RawNodeInput}; use super::document::{PyDocument, PyDocumentInfo}; use super::error::VectorlessError; @@ -25,6 +25,20 @@ async fn run_compile(engine: Arc, input: IngestInput) -> PyResult, + name: String, + nodes: Vec<(String, String, usize)>, +) -> PyResult { + let raw_nodes: Vec = nodes + .into_iter() + .map(|(title, content, level)| RawNodeInput { title, content, level }) + .collect(); + let input = IngestInput::PreParsed { name, nodes: raw_nodes }; + let doc = engine.compile(input).await.map_err(to_py_err)?; + Ok(PyDocumentInfo { inner: doc }) +} + async fn run_forget(engine: Arc, doc_id: String) -> PyResult<()> { engine.forget(&doc_id).await.map_err(to_py_err) } @@ -179,6 +193,30 @@ impl PyEngine { future_into_py(py, run_compile(engine, input)) } + /// Compile from pre-parsed raw nodes — skips the parse stage. + /// + /// Use this when you have already structured the document into nodes + /// (e.g., a Python plugin that parses code files into sections). + /// + /// Args: + /// name: Document name. + /// raw_nodes: List of (title, content, level) tuples. 
+ /// + /// Returns: + /// DocumentInfo with doc_id, summary, structure, concepts. + /// + /// Raises: + /// VectorlessError: If compilation fails. + fn compile_raw<'py>( + &self, + py: Python<'py>, + name: String, + raw_nodes: Vec<(String, String, usize)>, + ) -> PyResult> { + let engine = Arc::clone(&self.inner); + future_into_py(py, run_compile_raw(engine, name, raw_nodes)) + } + /// Remove a document by ID. /// /// Args: diff --git a/vectorless/engine.py b/vectorless/engine.py index 9b991eb..d7e2b0b 100644 --- a/vectorless/engine.py +++ b/vectorless/engine.py @@ -143,6 +143,7 @@ async def compile( directory: str | Path | None = None, content: str | None = None, bytes_data: bytes | None = None, + raw_nodes: list[dict[str, Any]] | None = None, format: str = "markdown", name: str | None = None, mode: str = "default", @@ -151,14 +152,27 @@ async def compile( """Compile a document from various sources. Exactly one source must be provided: path, paths, directory, - content, or bytes_data. + content, bytes_data, or raw_nodes. + + ``raw_nodes`` accepts a list of dicts with keys ``title``, ``content``, + and ``level``. This skips the parse stage — the pipeline starts from + tree building. Use this when the caller has already structured the + document (e.g., a Python plugin that parses code files). 
+ + Example:: + + nodes = [ + {"title": "src/main.py", "content": file_content, "level": 1}, + {"title": "src/lib.rs", "content": file_content, "level": 1}, + ] + result = await engine.compile(raw_nodes=nodes, name="my-project") """ sources_provided = sum( - x is not None for x in [path, paths, directory, content, bytes_data] + x is not None for x in [path, paths, directory, content, bytes_data, raw_nodes] ) if sources_provided != 1: raise ValueError( - "Provide exactly one source: path, paths, directory, content, or bytes_data" + "Provide exactly one source: path, paths, directory, content, bytes_data, or raw_nodes" ) # For single file, delegate to Rust compile @@ -222,6 +236,16 @@ async def compile( import os os.unlink(tmp_path) + if raw_nodes is not None: + doc_name = name or "pre-parsed" + # Convert dicts to (title, content, level) tuples + node_tuples = [ + (n.get("title", ""), n.get("content", ""), n.get("level", 1)) + for n in raw_nodes + ] + doc_info = await self._rust.compile_raw(doc_name, node_tuples) + return CompileOutput.from_doc_info(doc_info) + raise ValueError("No source provided") async def compile_batch( From 5bd1bad9eda4e665887d173b0573b28c8ec8cfcd Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 26 Apr 2026 21:57:44 +0800 Subject: [PATCH 2/2] refactor: standardize import order and improve code formatting - Standardize import order across multiple files to maintain consistency - Format function definitions and method implementations with proper indentation and line breaks - Refactor error handling chains to be more readable - Apply consistent code style throughout the parsing modules - Add documentation for vectorless-code AST parsing, getting started, and incremental compilation features --- .../src/parse/markdown/mod.rs | 19 ++- crates/vectorless-compiler/src/parse/mod.rs | 21 ++- .../vectorless-compiler/src/parse/pdf/mod.rs | 10 +- .../src/passes/frontend/parse.rs | 7 +- .../src/pipeline/context.rs | 9 +- 
crates/vectorless-engine/src/engine.rs | 12 +- crates/vectorless-engine/src/indexer.rs | 5 +- crates/vectorless-py/src/engine.rs | 11 +- docs/docs/vectorless-code/ast-parsing.mdx | 156 ++++++++++++++++++ docs/docs/vectorless-code/getting-started.mdx | 111 +++++++++++++ docs/docs/vectorless-code/incremental.mdx | 109 ++++++++++++ docs/sidebars.ts | 9 + 12 files changed, 443 insertions(+), 36 deletions(-) create mode 100644 docs/docs/vectorless-code/ast-parsing.mdx create mode 100644 docs/docs/vectorless-code/getting-started.mdx create mode 100644 docs/docs/vectorless-code/incremental.mdx diff --git a/crates/vectorless-compiler/src/parse/markdown/mod.rs b/crates/vectorless-compiler/src/parse/markdown/mod.rs index b49aefa..84d4596 100644 --- a/crates/vectorless-compiler/src/parse/markdown/mod.rs +++ b/crates/vectorless-compiler/src/parse/markdown/mod.rs @@ -27,7 +27,7 @@ mod parser; pub use parser::MarkdownParser; -use crate::parse::{Parser, ParseResult}; +use crate::parse::{ParseResult, Parser}; use std::path::Path; use vectorless_error::Result; @@ -39,15 +39,21 @@ pub struct MarkdownParserAdapter { impl MarkdownParserAdapter { /// Create a new Markdown parser adapter. 
pub fn new() -> Self { - Self { inner: MarkdownParser::new() } + Self { + inner: MarkdownParser::new(), + } } } #[async_trait::async_trait] impl Parser for MarkdownParserAdapter { - fn name(&self) -> &str { "markdown" } + fn name(&self) -> &str { + "markdown" + } - fn extensions(&self) -> &[&str] { &["md", "markdown"] } + fn extensions(&self) -> &[&str] { + &["md", "markdown"] + } async fn parse_content(&self, content: &str) -> Result { self.inner.parse(content).await @@ -58,9 +64,8 @@ impl Parser for MarkdownParserAdapter { } async fn parse_bytes(&self, data: &[u8]) -> Result { - let content = std::str::from_utf8(data).map_err(|e| { - vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e)) - })?; + let content = std::str::from_utf8(data) + .map_err(|e| vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e)))?; self.inner.parse(content).await } } diff --git a/crates/vectorless-compiler/src/parse/mod.rs b/crates/vectorless-compiler/src/parse/mod.rs index 1cfc687..ff53ccd 100644 --- a/crates/vectorless-compiler/src/parse/mod.rs +++ b/crates/vectorless-compiler/src/parse/mod.rs @@ -174,9 +174,10 @@ pub async fn parse_content( DocumentFormat::Pdf => Err(vectorless_error::Error::Parse( "PDF requires bytes, not string content".to_string(), )), - _ => Err(vectorless_error::Error::Parse( - format!("Unsupported format for content parsing: {:?}", format), - )), + _ => Err(vectorless_error::Error::Parse(format!( + "Unsupported format for content parsing: {:?}", + format + ))), } } @@ -198,9 +199,10 @@ pub async fn parse_file( }; parser.parse_file(path).await } - _ => Err(vectorless_error::Error::Parse( - format!("Unsupported format for file parsing: {:?}", format), - )), + _ => Err(vectorless_error::Error::Parse(format!( + "Unsupported format for file parsing: {:?}", + format + ))), } } @@ -225,9 +227,10 @@ pub async fn parse_bytes( }; parser.parse_bytes_async(bytes, None).await } - _ => Err(vectorless_error::Error::Parse( - format!("Unsupported format for 
bytes parsing: {:?}", format), - )), + _ => Err(vectorless_error::Error::Parse(format!( + "Unsupported format for bytes parsing: {:?}", + format + ))), } } diff --git a/crates/vectorless-compiler/src/parse/pdf/mod.rs b/crates/vectorless-compiler/src/parse/pdf/mod.rs index 6a25387..45a3647 100644 --- a/crates/vectorless-compiler/src/parse/pdf/mod.rs +++ b/crates/vectorless-compiler/src/parse/pdf/mod.rs @@ -28,7 +28,7 @@ mod types; pub use parser::PdfParser; pub use types::PdfPage; -use crate::parse::{Parser, ParseResult}; +use crate::parse::{ParseResult, Parser}; use std::path::Path; use vectorless_error::Result; use vectorless_llm::LlmClient; @@ -51,9 +51,13 @@ impl PdfParserAdapter { #[async_trait::async_trait] impl Parser for PdfParserAdapter { - fn name(&self) -> &str { "pdf" } + fn name(&self) -> &str { + "pdf" + } - fn extensions(&self) -> &[&str] { &["pdf"] } + fn extensions(&self) -> &[&str] { + &["pdf"] + } async fn parse_content(&self, _content: &str) -> Result { Err(vectorless_error::Error::Parse( diff --git a/crates/vectorless-compiler/src/passes/frontend/parse.rs b/crates/vectorless-compiler/src/passes/frontend/parse.rs index a3f224e..f6bfe1b 100644 --- a/crates/vectorless-compiler/src/passes/frontend/parse.rs +++ b/crates/vectorless-compiler/src/passes/frontend/parse.rs @@ -116,10 +116,9 @@ impl CompilePass for ParsePass { "node_count".to_string(), serde_json::json!(ctx.raw_nodes.len()), ); - stage_result.metadata.insert( - "source".to_string(), - serde_json::json!("pre-parsed"), - ); + stage_result + .metadata + .insert("source".to_string(), serde_json::json!("pre-parsed")); return Ok(stage_result); } diff --git a/crates/vectorless-compiler/src/pipeline/context.rs b/crates/vectorless-compiler/src/pipeline/context.rs index 6ef1c32..48d3d3f 100644 --- a/crates/vectorless-compiler/src/pipeline/context.rs +++ b/crates/vectorless-compiler/src/pipeline/context.rs @@ -107,10 +107,7 @@ impl CompilerInput { /// Create input from pre-parsed raw nodes. 
/// /// Skips ParsePass — the pipeline starts from BuildPass. - pub fn pre_parsed( - nodes: Vec, - name: impl Into, - ) -> Self { + pub fn pre_parsed(nodes: Vec, name: impl Into) -> Self { Self::PreParsed { nodes, name: name.into(), @@ -356,9 +353,7 @@ impl CompileContext { fn compute_source_hash(input: &CompilerInput) -> String { use sha2::{Digest, Sha256}; let hash = match input { - CompilerInput::File(path) => { - Sha256::digest(path.to_string_lossy().as_bytes()) - } + CompilerInput::File(path) => Sha256::digest(path.to_string_lossy().as_bytes()), CompilerInput::Content { content, .. } => Sha256::digest(content.as_bytes()), CompilerInput::Bytes { data, .. } => Sha256::digest(data), CompilerInput::PreParsed { nodes, .. } => { diff --git a/crates/vectorless-engine/src/engine.rs b/crates/vectorless-engine/src/engine.rs index 16a9901..be60d5f 100644 --- a/crates/vectorless-engine/src/engine.rs +++ b/crates/vectorless-engine/src/engine.rs @@ -419,7 +419,11 @@ impl Engine { let raw_nodes: Vec = nodes .iter() - .map(|n| RawNode::new(&n.title).with_content(&n.content).with_level(n.level)) + .map(|n| { + RawNode::new(&n.title) + .with_content(&n.content) + .with_level(n.level) + }) .collect(); let compiler_input = @@ -437,7 +441,11 @@ impl Engine { let doc_id = uuid::Uuid::new_v4().to_string(); let mut meta = DocumentMeta::new(); - meta.update_processing_stats(node_count, result.metrics.total_tokens_generated, result.metrics.total_time_ms()); + meta.update_processing_stats( + node_count, + result.metrics.total_tokens_generated, + result.metrics.total_time_ms(), + ); let doc = vectorless_document::Document { schema_version: CURRENT_SCHEMA_VERSION, diff --git a/crates/vectorless-engine/src/indexer.rs b/crates/vectorless-engine/src/indexer.rs index 5854fdd..7aec623 100644 --- a/crates/vectorless-engine/src/indexer.rs +++ b/crates/vectorless-engine/src/indexer.rs @@ -229,8 +229,9 @@ impl IndexerClient { }); let doc_id = Uuid::new_v4().to_string(); - self.events - 
.emit_compile(CompileEvent::FormatDetected { format: format.clone() }); + self.events.emit_compile(CompileEvent::FormatDetected { + format: format.clone(), + }); info!("Compiling {:?} document: {}", format, source_label); diff --git a/crates/vectorless-py/src/engine.rs b/crates/vectorless-py/src/engine.rs index a3471e5..49ef759 100644 --- a/crates/vectorless-py/src/engine.rs +++ b/crates/vectorless-py/src/engine.rs @@ -32,9 +32,16 @@ async fn run_compile_raw( ) -> PyResult { let raw_nodes: Vec = nodes .into_iter() - .map(|(title, content, level)| RawNodeInput { title, content, level }) + .map(|(title, content, level)| RawNodeInput { + title, + content, + level, + }) .collect(); - let input = IngestInput::PreParsed { name, nodes: raw_nodes }; + let input = IngestInput::PreParsed { + name, + nodes: raw_nodes, + }; let doc = engine.compile(input).await.map_err(to_py_err)?; Ok(PyDocumentInfo { inner: doc }) } diff --git a/docs/docs/vectorless-code/ast-parsing.mdx b/docs/docs/vectorless-code/ast-parsing.mdx new file mode 100644 index 0000000..250c21a --- /dev/null +++ b/docs/docs/vectorless-code/ast-parsing.mdx @@ -0,0 +1,156 @@ +--- +sidebar_position: 2 +--- + +# AST-Level Code Parsing + +vectorless-code uses tree-sitter to parse source code into semantic nodes — functions, classes, methods — instead of treating files as flat text. This produces a structured tree that the vectorless engine can navigate with precision. + +## Why AST Parsing Matters + +Naive code indexing treats each file as a single block of text. When you ask "how does authentication work", the engine has to scan entire files hoping to find relevant snippets. There's no understanding of what a function is, what a class contains, or how methods relate to their parent class. + +AST parsing changes this. 
The engine receives a tree like: + +``` +src/auth.py +├── class_definition: AuthService +│ ├── function_definition: __init__ +│ ├── function_definition: login +│ └── function_definition: verify_token +└── function_definition: create_session +``` + +Now the Orchestrator can `cd` into `AuthService`, `ls` to see its methods, and `cat login` to read the authentication logic. This is the same navigation model that works for documents — applied to code with structural precision. + +## How It Works + +### Per-Language Node Types + +Each language defines which AST node types represent semantic units worth indexing: + +```python +SPLITTABLE_NODE_TYPES = { + "python": { + "function_definition", + "class_definition", + "decorated_definition", + "async_function_definition", + }, + "rust": { + "function_item", + "impl_item", + "struct_item", + "enum_item", + "trait_item", + "mod_item", + }, + # ... 12 languages total +} +``` + +tree-sitter parses the source into an AST, then vectorless-code walks the tree extracting nodes whose type matches this set. Each extracted node becomes a `CodeNode` with: + +- `name` — the symbol name (e.g. `AuthService`, `login`) +- `node_type` — the AST node type (e.g. `class_definition`) +- `content` — the full source code of the node +- `children` — nested definitions (methods inside classes) + +### Nested Extraction + +When a class is extracted, its methods are extracted as children — not as separate top-level nodes. 
This preserves the parent-child relationship: + +```python +# Input: Python source +class AuthService: + def login(self, username, password): + token = self._create_token(username) + return token + + def verify_token(self, token): + return self._decode(token) + +# Output: CodeNode tree +CodeNode( + name="AuthService", + node_type="class_definition", + children=[ + CodeNode(name="login", node_type="function_definition", ...), + CodeNode(name="verify_token", node_type="function_definition", ...), + ], +) +``` + +This nesting produces the raw_node tree that vectorless builds into a navigable Document. Level 1 = file, Level 2 = top-level definitions, Level 3 = nested definitions. + +### Name Extraction + +The parser extracts human-readable names from AST nodes by finding identifier children: + +- `function_definition` → looks for `identifier` child → `"login"` +- `class_definition` → looks for `identifier` child → `"AuthService"` +- `decorated_definition` → recurses into the decorated node +- `impl_item` → looks for `type_identifier` → `"impl UserService"` + +## Fallback Strategy + +When tree-sitter is unavailable (unsupported language, grammar not installed, parse error), vectorless-code falls back to line-based splitting — splitting on blank-line boundaries into blocks. This produces flat `block` nodes without nesting, but still provides functional indexing. + +The fallback is transparent. The same `parse_file()` function handles both paths: + +```python +def parse_file(file_path, content, language): + parser = _get_parser(language) # cached per language + if parser is None: + return fallback_split(content, file_path, language) + + nodes = ast_extract(parser, content, language) + if not nodes: + return fallback_split(content, file_path, language) + return nodes +``` + +## Performance Considerations + +### Parser Caching + +tree-sitter `Parser` instances are cached per language. 
A 10,000-file Python project creates exactly one Python parser, reused for every `.py` file. This avoids repeated memory allocation and grammar loading. + +### Single-Pass File Scan + +Files are read exactly once. A single pass computes: + +1. File hash (SHA-256 for incremental detection) +2. Stats (line count, byte size, language distribution) +3. Content for parsing + +### Incremental Parsing + +On subsequent compiles, only files whose hash changed are re-parsed. Unchanged files reuse cached raw_nodes directly. See [Incremental Compilation](./incremental.mdx). + +## Adding a New Language + +To add support for a new language: + +1. Add the language to `SPLITTABLE_NODE_TYPES` with the relevant AST node types +2. Add the tree-sitter grammar package to `pyproject.toml` dependencies +3. Add the package mapping to `_LANG_PACKAGE_MAP` + +For example, to add Zig: + +```python +# ast_parser.py +SPLITTABLE_NODE_TYPES["zig"] = { + "FunctionDecl", + "TopLevelDecl", +} + +_LANG_PACKAGE_MAP["zig"] = "tree_sitter_zig" +``` + +```toml +# pyproject.toml +"tree-sitter-zig>=0.21", +``` + +No other code changes needed. The parser, cache, fallback, and incremental systems handle it automatically. diff --git a/docs/docs/vectorless-code/getting-started.mdx b/docs/docs/vectorless-code/getting-started.mdx new file mode 100644 index 0000000..53a2b90 --- /dev/null +++ b/docs/docs/vectorless-code/getting-started.mdx @@ -0,0 +1,111 @@ +--- +sidebar_position: 1 +--- + +# Getting Started + +**vectorless-code** is a code-aware search engine built on vectorless. It indexes your codebase using tree-sitter AST parsing and answers questions via LLM reasoning — no embeddings, no vector database. + +## Install + +```bash +pip install vectorless-code +``` + +## Quick Start + +```bash +# Initialize in your project directory +cd your-project +vcc init +``` + +This creates `.vectorless_code/settings.yml` with default include/exclude patterns. 
+ +```bash +# Compile the codebase +vcc compile +``` + +This discovers code files, parses them with tree-sitter into semantic nodes (functions, classes, methods), and feeds them to the vectorless engine. + +```bash +# Ask a question +vcc ask "where is the authentication logic" +``` + +## How It Works + +``` +vcc compile + │ + ├─ File Discovery (gitignore-aware) + │ + ├─ AST Parsing (tree-sitter) + │ ├─ Per-language SPLITTABLE_NODE_TYPES + │ └─ Fallback: line-based splitting + │ + ├─ Incremental (SHA-256 per-file hashing) + │ ├─ Changed/new → re-parse + │ └─ Unchanged → reuse cached raw_nodes + │ + └─ Engine.compile(raw_nodes=...) + └─ BuildPass → EnrichPass → ReasoningPass → NavigationPass +``` + +## Configuration + +### Project Settings + +`.vectorless_code/settings.yml`: + +```yaml +# File patterns to include +include_patterns: + - "**/*.py" + - "**/*.rs" + - "**/*.ts" + +# File patterns to exclude +exclude_patterns: + - "**/.*" + - "**/node_modules" + - "**/target" +``` + +### API Key + +```bash +# Option 1: Environment variable +export VECTORLESS_API_KEY="sk-..." + +# Option 2: Prompted during vcc init +vcc init +``` + +### Supported Languages + +| Language | Extensions | AST Parsing | +|----------|-----------|-------------| +| Python | `.py`, `.pyi` | Yes | +| Rust | `.rs` | Yes | +| Go | `.go` | Yes | +| JavaScript | `.js`, `.jsx`, `.mjs` | Yes | +| TypeScript | `.ts`, `.tsx` | Yes | +| Java | `.java` | Yes | +| C | `.c`, `.h` | Yes | +| C++ | `.cpp`, `.hpp`, `.cc` | Yes | +| Ruby | `.rb` | Yes | +| Swift | `.swift` | Yes | +| Kotlin | `.kt` | Yes | +| Scala | `.scala` | Yes | +| Others | `.sql`, `.sh`, `.lua`, etc. 
| Fallback (line-based) | + +## CLI Reference + +| Command | Description | +|---------|-------------| +| `vcc init` | Initialize project settings | +| `vcc compile` | Compile codebase into searchable index | +| `vcc ask <question>` | Ask a question about the codebase | +| `vcc status` | Show compilation status and statistics | diff --git a/docs/docs/vectorless-code/incremental.mdx b/docs/docs/vectorless-code/incremental.mdx new file mode 100644 index 0000000..3184afd --- /dev/null +++ b/docs/docs/vectorless-code/incremental.mdx @@ -0,0 +1,109 @@ +--- +sidebar_position: 3 +--- + +# Incremental Compilation + +For large codebases, re-parsing every file on each compile is wasteful. vectorless-code uses per-file SHA-256 hashing and a two-tier cache to skip unchanged files entirely. + +## The Problem + +A mid-size project might have 5,000 source files. Full AST parsing at ~10ms per file takes ~50 seconds. But in a typical edit session, maybe 5 files changed. Re-parsing all 5,000 files to index 5 changes is 1,000x more work than necessary. + +## Solution: Hash + Cache + +``` +First compile: + scan all files → hash → parse all → build raw_nodes → compile + +Subsequent compiles: + scan all files → hash → detect changes + ├─ 5 changed files → parse → build raw_nodes (fresh) + └─ 4,995 unchanged files → reuse cached raw_nodes + merge → compile +``` + +### Hash Computation + +Each file's content is SHA-256 hashed during the scan pass. The hash is deterministic — same content always produces the same hash, regardless of file metadata. 
+ +```python +current_hashes[rel] = hashlib.sha256(content.encode("utf-8")).hexdigest() +``` + +### Change Detection + +Comparing current hashes against the previous compile's hashes produces three sets: + +- **Changed or new** — hash differs or file didn't exist before → needs parsing +- **Unchanged** — hash matches → reuse cached raw_nodes +- **Removed** — file existed before but not now → exclude from output + +### Two-Tier Cache + +The cache stores two files in `.vectorless_code/cache/`: + +| File | Content | Purpose | +|------|---------|---------| +| `hashes.json` | `{rel_path: sha256_hex}` | Change detection | +| `parsed_nodes.json` | `{rel_path: [raw_nodes]}` | Skip re-parsing unchanged files | + +On incremental compile: + +1. Load `hashes.json` and `parsed_nodes.json` from previous run +2. Scan all files, compute current hashes +3. For changed files: parse with AST, build fresh raw_nodes +4. For unchanged files: load raw_nodes from `parsed_nodes.json` +5. Merge, sort by path, compile + +## Performance Impact + +| Scenario | Files | Changed | Parsing Time | +|----------|-------|---------|-------------| +| First compile | 5,000 | 5,000 | ~50s | +| Incremental (5 changes) | 5,000 | 5 | ~0.05s | +| Incremental (0 changes) | 5,000 | 0 | ~0s | + +The scan pass (read + hash) still touches every file, but this is I/O-bound and fast compared to AST parsing. The expensive work — tree-sitter parsing — only runs on changed files. + +## Cache Consistency + +### When is cache invalidated? + +- **File content changes** — hash mismatch triggers re-parse +- **File removed** — excluded from merged output +- **New file added** — no previous hash, treated as changed + +### What about parser upgrades? + +If `SPLITTABLE_NODE_TYPES` changes (e.g., adding a new node type to extract), the cache still contains raw_nodes built with the old configuration. To force a full re-parse: + +```bash +rm -rf .vectorless_code/cache/ +vcc compile +``` + +### When does cache get saved? 
+ +Cache is written after a successful compile. If the compile fails (e.g., API error), the cache is not updated — the next compile will retry with the same cache state. + +## Implementation Detail + +The scan-then-parse separation is intentional: + +```python +# Step 1: Scan (cheap, I/O-bound) +current_hashes, stats, content_map = _scan_files(files, root) + +# Step 2: Parse only changed files (expensive, CPU-bound) +changed = [p for p, h in current_hashes.items() if prev_hashes.get(p) != h] +for rel in changed: + nodes = parse_file(rel, content_map[rel], lang) + fresh_raw[rel] = build_raw_nodes([(rel, lang, nodes)]) + +# Step 3: Merge with cached +for rel in unchanged: + merged_raw[rel] = cached_raw[rel] +``` + +`content_map` holds file contents in memory during the scan pass and is released (`del content_map`) after parsing completes. For large codebases, this means peak memory is roughly the sum of all file contents — acceptable because file contents are strings that Python can manage efficiently, and they're freed before the compile step which has its own memory profile. diff --git a/docs/sidebars.ts b/docs/sidebars.ts index c7ee832..678e0a0 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -11,6 +11,15 @@ const sidebars: SidebarsConfig = { ], }, 'architecture', + { + type: 'category', + label: 'vectorless-code', + items: [ + 'vectorless-code/getting-started', + 'vectorless-code/ast-parsing', + 'vectorless-code/incremental', + ], + }, { type: 'category', label: 'Vectorless Compiler',