From 80ce5a627417f77f6564fec279374c82556a6aca Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Sun, 26 Apr 2026 18:55:37 +0800
Subject: [PATCH 1/2] feat: add parser registry and custom format support

Add extensible parser system with trait-based architecture that allows
custom format parsers to be registered alongside built-in Markdown and
PDF support.

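BREAKING CHANGE: SourceFormat and DocumentFormat gain a Custom(String)
variant and no longer derive Copy, so exhaustive matches need a new arm
and by-value copies need an explicit clone(). DocumentFormat also
replaces its derived serde impls with hand-written string-based ones.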

- Introduce Parser trait and ParserRegistry for custom format support
- Add SourceFormat::Custom variant for plugin-resolved formats
- Support pre-parsed raw nodes input to skip parsing stage
- Provide builder methods for PipelineExecutor with custom registries
- Add Python bindings for raw node compilation
- Update documentation with examples for adding custom parsers
---
 crates/vectorless-compiler/src/config.rs      |   4 +-
 crates/vectorless-compiler/src/lib.rs         |   3 +
 .../src/parse/markdown/mod.rs                 |  38 +++++
 crates/vectorless-compiler/src/parse/mod.rs   | 153 +++++++++++++++++-
 .../vectorless-compiler/src/parse/pdf/mod.rs  |  42 +++++
 .../src/passes/frontend/parse.rs              | 118 ++++++++++----
 .../src/pipeline/context.rs                   |  46 +++++-
 .../src/pipeline/executor.rs                  |  32 ++++
 .../src/pipeline/orchestrator.rs              |   2 +-
 crates/vectorless-document/src/format.rs      |  28 +++-
 crates/vectorless-document/src/lib.rs         |   1 +
 .../vectorless-document/src/understanding.rs  |  24 +++
 crates/vectorless-engine/src/engine.rs        |  76 ++++++++-
 crates/vectorless-engine/src/indexer.rs       |  13 +-
 crates/vectorless-engine/src/lib.rs           |   2 +-
 crates/vectorless-py/src/engine.rs            |  40 ++++-
 vectorless/engine.py                          |  30 +++-
 17 files changed, 594 insertions(+), 58 deletions(-)

diff --git a/crates/vectorless-compiler/src/config.rs b/crates/vectorless-compiler/src/config.rs
index b129ddc..4983241 100644
--- a/crates/vectorless-compiler/src/config.rs
+++ b/crates/vectorless-compiler/src/config.rs
@@ -18,7 +18,7 @@ use vectorless_utils::fingerprint::{Fingerprint, Fingerprinter};
 use std::path::PathBuf;
 
 /// Index mode for document processing.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub enum SourceFormat {
     /// Auto-detect format from file extension.
     Auto,
@@ -26,6 +26,8 @@ pub enum SourceFormat {
     Markdown,
     /// Force PDF format.
     Pdf,
+    /// Custom format resolved via [`ParserRegistry`](crate::parse::ParserRegistry).
+    Custom(String),
 }
 
 impl Default for SourceFormat {
diff --git a/crates/vectorless-compiler/src/lib.rs b/crates/vectorless-compiler/src/lib.rs
index 29be8f6..d60697b 100644
--- a/crates/vectorless-compiler/src/lib.rs
+++ b/crates/vectorless-compiler/src/lib.rs
@@ -67,6 +67,9 @@ pub mod summary;
 // Re-export main types from pipeline
 pub use pipeline::{CompileMetrics, CompileResult, CompilerInput, PipelineExecutor};
 
+// Re-export parser plugin types
+pub use parse::{Parser, ParserRegistry};
+
 // Re-export config types
 pub use config::{PipelineOptions, SourceFormat, ThinningConfig};
 pub use vectorless_document::ReasoningIndexConfig;
diff --git a/crates/vectorless-compiler/src/parse/markdown/mod.rs b/crates/vectorless-compiler/src/parse/markdown/mod.rs
index e384f52..b49aefa 100644
--- a/crates/vectorless-compiler/src/parse/markdown/mod.rs
+++ b/crates/vectorless-compiler/src/parse/markdown/mod.rs
@@ -26,3 +26,41 @@ mod frontmatter;
 mod parser;
 
 pub use parser::MarkdownParser;
+
+use crate::parse::{Parser, ParseResult};
+use std::path::Path;
+use vectorless_error::Result;
+
+/// [`Parser`] trait adapter for [`MarkdownParser`].
+pub struct MarkdownParserAdapter {
+    inner: MarkdownParser,
+}
+
+impl MarkdownParserAdapter {
+    /// Create a new Markdown parser adapter.
+    pub fn new() -> Self {
+        Self { inner: MarkdownParser::new() }
+    }
+}
+
+#[async_trait::async_trait]
+impl Parser for MarkdownParserAdapter {
+    fn name(&self) -> &str { "markdown" }
+
+    fn extensions(&self) -> &[&str] { &["md", "markdown"] }
+
+    async fn parse_content(&self, content: &str) -> Result<ParseResult> {
+        self.inner.parse(content).await
+    }
+
+    async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
+        self.inner.parse_file(path).await
+    }
+
+    async fn parse_bytes(&self, data: &[u8]) -> Result<ParseResult> {
+        let content = std::str::from_utf8(data).map_err(|e| {
+            vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e))
+        })?;
+        self.inner.parse(content).await
+    }
+}
diff --git a/crates/vectorless-compiler/src/parse/mod.rs b/crates/vectorless-compiler/src/parse/mod.rs
index 593f654..1cfc687 100644
--- a/crates/vectorless-compiler/src/parse/mod.rs
+++ b/crates/vectorless-compiler/src/parse/mod.rs
@@ -3,8 +3,26 @@
 
 //! Document parsing for the compile pipeline.
 //!
-//! Supports Markdown and PDF formats. Parsing is dispatched directly
-//! via `match` — no trait objects or registry needed.
+//! Supports Markdown and PDF formats out of the box. Custom parsers can be
+//! added via the [`Parser`] trait and [`ParserRegistry`].
+//!
+//! # Adding a custom parser
+//!
+//! ```rust,ignore
+//! use vectorless_compiler::parse::{Parser, ParseResult, ParserRegistry};
+//!
+//! struct MyParser;
+//!
+//! #[async_trait::async_trait]
+//! impl Parser for MyParser {
+//!     fn name(&self) -> &str { "my-format" }
+//!     fn extensions(&self) -> &[&str] { &["foo", "bar"] }
+//!     async fn parse_content(&self, content: &str) -> vectorless_error::Result<ParseResult> { ... }
+//!     async fn parse_file(&self, path: &std::path::Path) -> vectorless_error::Result<ParseResult> { ... }
+//! }
+//!
+//! let registry = ParserRegistry::default_parsers(None).with(MyParser);
+//! ```
 
 pub mod markdown;
 pub mod pdf;
@@ -14,12 +32,134 @@ pub mod types;
 // Re-export core types at module level
 pub use types::{DocumentFormat, DocumentMeta, ParseResult, RawNode};
 
+use std::collections::HashMap;
 use std::path::Path;
 
 use crate::parse::markdown::MarkdownParser;
 use vectorless_error::Result;
 use vectorless_llm::LlmClient;
 
+// ---------------------------------------------------------------------------
+// Parser trait
+// ---------------------------------------------------------------------------
+
+/// Trait for document format parsers.
+///
+/// Implement this to add support for a new document format.
+/// Register via [`ParserRegistry::register`] or [`ParserRegistry::with`].
+#[async_trait::async_trait]
+pub trait Parser: Send + Sync {
+    /// Parser name; doubles as the registry lookup key (e.g., "markdown", "pdf", "code").
+    fn name(&self) -> &str;
+
+    /// File extensions this parser handles, without dot (e.g., `["py", "rs"]`); none by default.
+    fn extensions(&self) -> &[&str] {
+        &[]
+    }
+
+    /// Parse string content into raw nodes.
+    async fn parse_content(&self, content: &str) -> Result<ParseResult>;
+
+    /// Parse a file into raw nodes.
+    async fn parse_file(&self, path: &Path) -> Result<ParseResult>;
+
+    /// Parse binary data into raw nodes. The default implementation returns a parse error.
+    async fn parse_bytes(&self, data: &[u8]) -> Result<ParseResult> {
+        let _ = data;
+        Err(vectorless_error::Error::Parse(
+            "Binary parsing not supported by this parser".into(),
+        ))
+    }
+}
+
+// ---------------------------------------------------------------------------
+// ParserRegistry
+// ---------------------------------------------------------------------------
+
+/// Registry of document format parsers.
+///
+/// Maps parser names and file extensions to [`Parser`] implementations.
+/// Built-in parsers for Markdown and PDF are provided by [`ParserRegistry::default_parsers`].
+pub struct ParserRegistry {
+    parsers: HashMap<String, Box<dyn Parser>>,
+    extension_map: HashMap<String, String>,
+}
+
+impl ParserRegistry {
+    /// Create an empty registry.
+    pub fn new() -> Self {
+        Self {
+            parsers: HashMap::new(),
+            extension_map: HashMap::new(),
+        }
+    }
+
+    /// Register a parser under its name and index its extensions (later registrations win).
+    pub fn register(&mut self, parser: impl Parser + 'static) {
+        let name = parser.name().to_string();
+        for ext in parser.extensions() {
+            self.extension_map.insert(ext.to_lowercase(), name.clone());
+        }
+        self.parsers.insert(name, Box::new(parser));
+    }
+
+    /// Builder-style registration.
+    pub fn with(mut self, parser: impl Parser + 'static) -> Self {
+        self.register(parser);
+        self
+    }
+
+    /// Get a parser by name.
+    pub fn get(&self, name: &str) -> Option<&dyn Parser> {
+        self.parsers.get(name).map(|p| p.as_ref())
+    }
+
+    /// Get a parser by file extension (matched case-insensitively).
+    pub fn get_by_extension(&self, ext: &str) -> Option<&dyn Parser> {
+        self.extension_map
+            .get(&ext.to_lowercase())
+            .and_then(|name| self.parsers.get(name))
+            .map(|p| p.as_ref())
+    }
+
+    /// Default registry with built-in Markdown + PDF parsers.
+    pub fn default_parsers(llm_client: Option<LlmClient>) -> Self {
+        let mut registry = Self::new();
+        registry.register(markdown::MarkdownParserAdapter::new());
+        registry.register(pdf::PdfParserAdapter::new(llm_client));
+        registry
+    }
+
+    /// List all registered parser names (in no particular order).
+    pub fn parser_names(&self) -> Vec<&str> {
+        self.parsers.keys().map(|s| s.as_str()).collect()
+    }
+
+    /// List all supported file extensions (lowercase, no dot).
+    pub fn supported_extensions(&self) -> Vec<&str> {
+        self.extension_map.keys().map(|s| s.as_str()).collect()
+    }
+}
+
+impl Default for ParserRegistry {
+    fn default() -> Self {
+        Self::default_parsers(None)
+    }
+}
+
+impl std::fmt::Debug for ParserRegistry {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ParserRegistry")
+            .field("parsers", &self.parsers.keys().collect::<Vec<_>>())
+            .field("extensions", &self.extension_map)
+            .finish()
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Legacy free functions (backward compat — delegate to default registry)
+// ---------------------------------------------------------------------------
+
 /// Parse a string content document.
 pub async fn parse_content(
     content: &str,
@@ -34,6 +174,9 @@ pub async fn parse_content(
         DocumentFormat::Pdf => Err(vectorless_error::Error::Parse(
             "PDF requires bytes, not string content".to_string(),
         )),
+        _ => Err(vectorless_error::Error::Parse(
+            format!("Unsupported format for content parsing: {:?}", format),
+        )),
     }
 }
 
@@ -55,6 +198,9 @@ pub async fn parse_file(
             };
             parser.parse_file(path).await
         }
+        _ => Err(vectorless_error::Error::Parse(
+            format!("Unsupported format for file parsing: {:?}", format),
+        )),
     }
 }
 
@@ -79,6 +225,9 @@ pub async fn parse_bytes(
             };
             parser.parse_bytes_async(bytes, None).await
         }
+        _ => Err(vectorless_error::Error::Parse(
+            format!("Unsupported format for bytes parsing: {:?}", format),
+        )),
     }
 }
 
diff --git a/crates/vectorless-compiler/src/parse/pdf/mod.rs b/crates/vectorless-compiler/src/parse/pdf/mod.rs
index 3226e44..6a25387 100644
--- a/crates/vectorless-compiler/src/parse/pdf/mod.rs
+++ b/crates/vectorless-compiler/src/parse/pdf/mod.rs
@@ -27,3 +27,45 @@ mod types;
 
 pub use parser::PdfParser;
 pub use types::PdfPage;
+
+use crate::parse::{Parser, ParseResult};
+use std::path::Path;
+use vectorless_error::Result;
+use vectorless_llm::LlmClient;
+
+/// [`Parser`] trait adapter for [`PdfParser`].
+pub struct PdfParserAdapter {
+    inner: PdfParser,
+}
+
+impl PdfParserAdapter {
+    /// Create a PDF parser adapter, optionally with LLM support.
+    pub fn new(llm_client: Option<LlmClient>) -> Self {
+        let inner = match llm_client {
+            Some(client) => PdfParser::with_llm_client(client),
+            None => PdfParser::new(),
+        };
+        Self { inner }
+    }
+}
+
+#[async_trait::async_trait]
+impl Parser for PdfParserAdapter {
+    fn name(&self) -> &str { "pdf" }
+
+    fn extensions(&self) -> &[&str] { &["pdf"] }
+
+    async fn parse_content(&self, _content: &str) -> Result<ParseResult> {
+        Err(vectorless_error::Error::Parse(
+            "PDF requires bytes, not string content".into(),
+        ))
+    }
+
+    async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
+        self.inner.parse_file(path).await
+    }
+
+    async fn parse_bytes(&self, data: &[u8]) -> Result<ParseResult> {
+        self.inner.parse_bytes_async(data, None).await
+    }
+}
diff --git a/crates/vectorless-compiler/src/passes/frontend/parse.rs b/crates/vectorless-compiler/src/passes/frontend/parse.rs
index d757bca..a3f224e 100644
--- a/crates/vectorless-compiler/src/passes/frontend/parse.rs
+++ b/crates/vectorless-compiler/src/passes/frontend/parse.rs
@@ -11,6 +11,7 @@ use vectorless_document::DocumentFormat;
 use vectorless_error::Result;
 
 use crate::SourceFormat;
+use crate::parse::ParserRegistry;
 use crate::passes::{CompilePass, PassResult};
 use crate::pipeline::{CompileContext, CompilerInput};
 
@@ -18,24 +19,38 @@ use crate::pipeline::{CompileContext, CompilerInput};
 pub struct ParsePass {
     /// Optional LLM client for PDF structure extraction.
     llm_client: Option<vectorless_llm::LlmClient>,
+    /// Parser registry for format dispatch.
+    registry: ParserRegistry,
 }
 
 impl ParsePass {
-    /// Create a new parse stage.
+    /// Create a new parse stage with default parsers.
     pub fn new() -> Self {
-        Self { llm_client: None }
+        Self {
+            llm_client: None,
+            registry: ParserRegistry::default_parsers(None),
+        }
     }
 
     /// Create a parse stage with an LLM client.
     pub fn with_llm_client(client: vectorless_llm::LlmClient) -> Self {
         Self {
-            llm_client: Some(client),
+            llm_client: Some(client.clone()),
+            registry: ParserRegistry::default_parsers(Some(client)),
+        }
+    }
+
+    /// Create a parse stage with a custom parser registry.
+    pub fn with_registry(registry: ParserRegistry) -> Self {
+        Self {
+            llm_client: None,
+            registry,
         }
     }
 
     /// Detect document format from path and options.
     fn detect_format(&self, ctx: &CompileContext) -> Result<DocumentFormat> {
-        match ctx.options.mode {
+        match &ctx.options.mode {
             SourceFormat::Auto => match &ctx.input {
                 CompilerInput::File(path) => {
                     let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
@@ -43,11 +58,22 @@ impl ParsePass {
                         vectorless_error::Error::Parse(format!("Unknown format: {}", ext))
                     })
                 }
-                CompilerInput::Content { format, .. } => Ok(*format),
-                CompilerInput::Bytes { format, .. } => Ok(*format),
+                CompilerInput::Content { format, .. } => Ok(format.clone()),
+                CompilerInput::Bytes { format, .. } => Ok(format.clone()),
+                CompilerInput::PreParsed { .. } => Ok(DocumentFormat::Markdown),
             },
             SourceFormat::Markdown => Ok(DocumentFormat::Markdown),
             SourceFormat::Pdf => Ok(DocumentFormat::Pdf),
+            SourceFormat::Custom(name) => Ok(DocumentFormat::Custom(name.clone())),
+        }
+    }
+
+    /// Resolve format name for registry lookup.
+    fn format_name(format: &DocumentFormat) -> &str {
+        match format {
+            DocumentFormat::Markdown => "markdown",
+            DocumentFormat::Pdf => "pdf",
+            DocumentFormat::Custom(name) => name,
         }
     }
 }
@@ -67,63 +93,88 @@ impl CompilePass for ParsePass {
     async fn execute(&mut self, ctx: &mut CompileContext) -> Result<PassResult> {
         let start = Instant::now();
 
+        // Handle pre-parsed input: skip parsing entirely
+        if let CompilerInput::PreParsed { nodes, name } = &ctx.input {
+            let nodes = nodes.clone();
+            let name = name.clone();
+            ctx.raw_nodes = nodes;
+            ctx.name = name;
+            ctx.format = DocumentFormat::Custom("pre-parsed".to_string());
+            ctx.metrics.set_nodes_processed(ctx.raw_nodes.len());
+
+            let duration = start.elapsed().as_millis() as u64;
+            info!(
+                "[parse] Pre-parsed: {} nodes for '{}' ({}ms)",
+                ctx.raw_nodes.len(),
+                ctx.name,
+                duration
+            );
+
+            let mut stage_result = PassResult::success("parse");
+            stage_result.duration_ms = duration;
+            stage_result.metadata.insert(
+                "node_count".to_string(),
+                serde_json::json!(ctx.raw_nodes.len()),
+            );
+            stage_result.metadata.insert(
+                "source".to_string(),
+                serde_json::json!("pre-parsed"),
+            );
+            return Ok(stage_result);
+        }
+
         // Detect format
         let format = self.detect_format(ctx)?;
+        let format_name = Self::format_name(&format).to_string();
         ctx.format = format;
 
         let input_type = match &ctx.input {
             CompilerInput::File(_) => "file",
             CompilerInput::Content { .. } => "content",
             CompilerInput::Bytes { .. } => "bytes",
+            CompilerInput::PreParsed { .. } => unreachable!(),
         };
+
         info!(
-            "[parse] Starting: format={:?}, input={}, llm={}",
-            format,
+            "[parse] Starting: format={}, input={}, llm={}",
+            format_name,
             input_type,
             self.llm_client.is_some()
         );
 
+        // Look up parser in registry
+        let parser = self.registry.get(&format_name).ok_or_else(|| {
+            vectorless_error::Error::Parse(format!(
+                "No parser registered for format '{}'. Available: {:?}",
+                format_name,
+                self.registry.parser_names()
+            ))
+        })?;
+
         // Parse based on input type
         let result = match &ctx.input {
             CompilerInput::File(path) => {
-                // Resolve path
                 let path = path.canonicalize().unwrap_or_else(|_| path.clone());
                 ctx.source_path = Some(path.clone());
-
-                // Extract name from file
                 ctx.name = path
                     .file_stem()
                     .and_then(|n| n.to_str())
                     .unwrap_or("document")
                     .to_string();
-
                 debug!("[parse] Reading file: {:?}", ctx.source_path);
-
-                // Parse directly
-                crate::parse::parse_file(&path, format, self.llm_client.clone()).await?
+                parser.parse_file(&path).await?
             }
-            CompilerInput::Content {
-                content,
-                name,
-                format,
-            } => {
-                // Set name
+            CompilerInput::Content { content, name, .. } => {
                 ctx.name = name.clone();
-
                 debug!("[parse] Parsing inline content ({} chars)", content.len());
-
-                // Parse content directly
-                crate::parse::parse_content(content, *format, self.llm_client.clone()).await?
+                parser.parse_content(content).await?
             }
-            CompilerInput::Bytes { data, name, format } => {
-                // Set name
+            CompilerInput::Bytes { data, name, .. } => {
                 ctx.name = name.clone();
-
                 debug!("[parse] Parsing bytes ({} bytes)", data.len());
-
-                // Parse bytes
-                crate::parse::parse_bytes(data, *format, self.llm_client.clone()).await?
+                parser.parse_bytes(data).await?
             }
+            CompilerInput::PreParsed { .. } => unreachable!(),
         };
 
         // Store results
@@ -145,9 +196,10 @@ impl CompilePass for ParsePass {
         ctx.metrics.record_parse(duration);
 
         info!(
-            "[parse] Complete: {} nodes from '{}' ({}ms)",
+            "[parse] Complete: {} nodes from '{}' ({}, {}ms)",
             ctx.raw_nodes.len(),
             ctx.name,
+            format_name,
             duration
         );
 
@@ -159,7 +211,7 @@ impl CompilePass for ParsePass {
         );
         stage_result
             .metadata
-            .insert("format".to_string(), serde_json::json!(format.extension()));
+            .insert("format".to_string(), serde_json::json!(&format_name));
 
         Ok(stage_result)
     }
diff --git a/crates/vectorless-compiler/src/pipeline/context.rs b/crates/vectorless-compiler/src/pipeline/context.rs
index 7aab10e..6ef1c32 100644
--- a/crates/vectorless-compiler/src/pipeline/context.rs
+++ b/crates/vectorless-compiler/src/pipeline/context.rs
@@ -41,6 +41,17 @@ pub enum CompilerInput {
         /// Document format.
         format: DocumentFormat,
     },
+
+    /// Pre-parsed raw nodes — no parser is invoked.
+    ///
+    /// Use this when the caller (e.g., a Python plugin) has already parsed
+    /// the document into structured nodes. ParsePass stores them without parsing.
+    PreParsed {
+        /// Pre-parsed raw nodes.
+        nodes: Vec<crate::parse::RawNode>,
+        /// Document name.
+        name: String,
+    },
 }
 
 impl CompilerInput {
@@ -93,6 +104,19 @@ impl CompilerInput {
         }
     }
 
+    /// Create input from pre-parsed raw nodes.
+    ///
+    /// No parser runs: ParsePass stores the nodes in the context without parsing.
+    pub fn pre_parsed(
+        nodes: Vec<crate::parse::RawNode>,
+        name: impl Into<String>,
+    ) -> Self {
+        Self::PreParsed {
+            nodes,
+            name: name.into(),
+        }
+    }
+
     /// Check if this is a file input.
     pub fn is_file(&self) -> bool {
         matches!(self, Self::File(_))
@@ -108,12 +132,18 @@ impl CompilerInput {
         matches!(self, Self::Bytes { .. })
     }
 
+    /// Check if this is a pre-parsed input.
+    pub fn is_pre_parsed(&self) -> bool {
+        matches!(self, Self::PreParsed { .. })
+    }
+
     /// Get the format if available.
     pub fn format(&self) -> Option<DocumentFormat> {
         match self {
             Self::File(_) => None,
-            Self::Content { format, .. } => Some(*format),
-            Self::Bytes { format, .. } => Some(*format),
+            Self::Content { format, .. } => Some(format.clone()),
+            Self::Bytes { format, .. } => Some(format.clone()),
+            Self::PreParsed { .. } => None,
         }
     }
 }
@@ -327,13 +357,19 @@ impl CompileContext {
         use sha2::{Digest, Sha256};
         let hash = match input {
             CompilerInput::File(path) => {
-                // Hash the file path as proxy — actual content may not be readable yet
-                // (the parse stage reads it). This is sufficient for checkpoint invalidation
-                // since a different file path implies different content.
                 Sha256::digest(path.to_string_lossy().as_bytes())
             }
             CompilerInput::Content { content, .. } => Sha256::digest(content.as_bytes()),
             CompilerInput::Bytes { data, .. } => Sha256::digest(data),
+            CompilerInput::PreParsed { nodes, .. } => {
+                // Hash only the node count and the first 10 titles (node content is not hashed)
+                let mut hasher = Sha256::new();
+                hasher.update(nodes.len().to_le_bytes());
+                for node in nodes.iter().take(10) {
+                    hasher.update(node.title.as_bytes());
+                }
+                hasher.finalize()
+            }
         };
         format!("{:x}", hash)
     }
diff --git a/crates/vectorless-compiler/src/pipeline/executor.rs b/crates/vectorless-compiler/src/pipeline/executor.rs
index d73c3bc..41f870f 100644
--- a/crates/vectorless-compiler/src/pipeline/executor.rs
+++ b/crates/vectorless-compiler/src/pipeline/executor.rs
@@ -12,6 +12,7 @@ use vectorless_error::Result;
 use vectorless_llm::LlmClient;
 
 use super::super::PipelineOptions;
+use super::super::parse::{Parser, ParserRegistry};
 use super::super::passes::{
     BuildPass, ChainPass, CompilePass, ConceptPass, EnhancePass, EnrichPass, NavigationPass,
     OptimizePass, OverlapPass, ParsePass, ReasoningPass, RoutePass, ScorePass, SplitPass,
@@ -138,6 +139,37 @@ impl PipelineExecutor {
         Self { orchestrator }
     }
 
+    /// Create with a custom parser registry.
+    ///
+    /// The registry is handed to the parse stage as-is; build it from
+    /// [`ParserRegistry::default_parsers`] to keep the Markdown and PDF parsers.
+    pub fn with_registry(registry: ParserRegistry) -> Self {
+        let orchestrator = PipelineOrchestrator::new()
+            .stage_with_priority(ParsePass::with_registry(registry), 10)
+            .stage_with_priority(BuildPass::new(), 20)
+            .stage_with_priority(ValidatePass::new(), 22)
+            .stage_with_priority(SplitPass::new(), 25)
+            .stage_with_priority(EnrichPass::new(), 40)
+            .stage_with_priority(ReasoningPass::new(), 45)
+            .stage_with_priority(ConceptPass::new(), 47)
+            .stage_with_priority(NavigationPass::new(), 50)
+            .stage_with_priority(RoutePass::new(), 52)
+            .stage_with_priority(ChainPass::new(), 54)
+            .stage_with_priority(OverlapPass::new(), 56)
+            .stage_with_priority(ScorePass::new(), 58)
+            .stage_with_priority(VerifyPass, 55)
+            .stage_with_priority(OptimizePass::new(), 60);
+        Self { orchestrator }
+    }
+
+    /// Add a single custom parser.
+    ///
+    /// Starts from the built-in parsers (created without an LLM client) and adds `parser`.
+    pub fn with_parser(parser: impl Parser + 'static) -> Self {
+        let registry = ParserRegistry::default_parsers(None).with(parser);
+        Self::with_registry(registry)
+    }
+
     /// Add a stage with default priority.
     ///
     /// The stage will be added after existing stages with the same priority.
diff --git a/crates/vectorless-compiler/src/pipeline/orchestrator.rs b/crates/vectorless-compiler/src/pipeline/orchestrator.rs
index 26c0860..d1741b6 100644
--- a/crates/vectorless-compiler/src/pipeline/orchestrator.rs
+++ b/crates/vectorless-compiler/src/pipeline/orchestrator.rs
@@ -576,7 +576,7 @@ impl PipelineOrchestrator {
                         clone.existing_tree = ctx.existing_tree.clone();
                         clone.doc_id = ctx.doc_id.clone();
                         clone.name = ctx.name.clone();
-                        clone.format = ctx.format;
+                        clone.format = ctx.format.clone();
                         clone.source_path = ctx.source_path.clone();
                         if let Some(ref llm) = ctx.llm_client {
                             clone.llm_client = Some(llm.clone());
diff --git a/crates/vectorless-document/src/format.rs b/crates/vectorless-document/src/format.rs
index 8901dc4..a308fdd 100644
--- a/crates/vectorless-document/src/format.rs
+++ b/crates/vectorless-document/src/format.rs
@@ -6,12 +6,35 @@
 use serde::{Deserialize, Serialize};
 
 /// Supported document formats.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum DocumentFormat {
     /// Markdown files (.md, .markdown)
     Markdown,
     /// PDF files (.pdf)
     Pdf,
+    /// Custom format identified by name (for parser plugins).
+    Custom(String),
+}
+
+impl Serialize for DocumentFormat {
+    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        serializer.serialize_str(match self {
+            Self::Markdown => "markdown",
+            Self::Pdf => "pdf",
+            Self::Custom(name) => name.as_str(), // custom formats are written as their bare name
+        })
+    }
+}
+
+impl<'de> Deserialize<'de> for DocumentFormat {
+    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        let s = String::deserialize(deserializer)?;
+        match s.as_str() {
+            "markdown" | "Markdown" => Ok(Self::Markdown), // "Markdown": tag written by the old derive
+            "pdf" | "Pdf" => Ok(Self::Pdf), // "Pdf": tag written by the old derive
+            _ => Ok(Self::Custom(s)), // any other tag names a plugin-provided format
+        }
+    }
+}
 
 impl DocumentFormat {
@@ -25,10 +48,11 @@ impl DocumentFormat {
     }
 
     /// Get the file extension for this format.
-    pub fn extension(&self) -> &'static str {
+    pub fn extension(&self) -> &str {
         match self {
             Self::Markdown => "md",
             Self::Pdf => "pdf",
+            Self::Custom(name) => name,
         }
     }
 
diff --git a/crates/vectorless-document/src/lib.rs b/crates/vectorless-document/src/lib.rs
index 08d7cd9..b32a05b 100644
--- a/crates/vectorless-document/src/lib.rs
+++ b/crates/vectorless-document/src/lib.rs
@@ -46,6 +46,7 @@ pub use toc::{TocConfig, TocEntry, TocNode, TocView};
 pub use tree::{DocumentTree, RetrievalIndex};
 pub use understanding::{
     CURRENT_SCHEMA_VERSION, Concept, Document, DocumentInfo, DocumentMeta, IngestInput,
+    RawNodeInput,
 };
 
 // Re-export agent acceleration types
diff --git a/crates/vectorless-document/src/understanding.rs b/crates/vectorless-document/src/understanding.rs
index 91a40c6..47951e6 100644
--- a/crates/vectorless-document/src/understanding.rs
+++ b/crates/vectorless-document/src/understanding.rs
@@ -353,6 +353,30 @@ pub enum IngestInput {
         /// Document content.
         content: String,
     },
+    /// Compile from pre-parsed raw nodes.
+    ///
+    /// Skips the parse stage — the pipeline starts from tree building.
+    /// Use this when the caller has already structured the document.
+    PreParsed {
+        /// Document name.
+        name: String,
+        /// Pre-parsed raw nodes.
+        nodes: Vec<RawNodeInput>,
+    },
+}
+
+/// A raw node for [`IngestInput::PreParsed`].
+///
+/// Simplified version of `RawNode` for external API — callers construct
+/// these from Python or other languages.
+#[derive(Debug, Clone)]
+pub struct RawNodeInput {
+    /// Node title (e.g., section heading or file path).
+    pub title: String,
+    /// Node content.
+    pub content: String,
+    /// Hierarchy level (0 = root, 1 = top-level, etc.).
+    pub level: usize,
 }
 
 #[cfg(test)]
diff --git a/crates/vectorless-engine/src/engine.rs b/crates/vectorless-engine/src/engine.rs
index 84b9184..16a9901 100644
--- a/crates/vectorless-engine/src/engine.rs
+++ b/crates/vectorless-engine/src/engine.rs
@@ -374,15 +374,21 @@ impl Engine {
     /// The engine builds a full understanding including tree, navigation index,
     /// reasoning index, summary, and key concepts.
     pub async fn compile(&self, input: IngestInput) -> Result<vectorless_document::DocumentInfo> {
+        // Handle PreParsed input directly — bypass CompileSource routing
+        if let IngestInput::PreParsed { nodes, name } = &input {
+            return self.compile_pre_parsed(nodes, name).await;
+        }
+
         let ctx = match &input {
             IngestInput::Path(path) => CompileInput::from_path(path),
             IngestInput::Bytes { data, format, .. } => {
-                CompileInput::from_bytes(data.clone(), *format)
+                CompileInput::from_bytes(data.clone(), format.clone())
             }
             IngestInput::Text { content, .. } => CompileInput::from_content(
                 content,
                 vectorless_compiler::parse::DocumentFormat::Markdown,
             ),
+            IngestInput::PreParsed { .. } => unreachable!(),
         };
 
         let result = self.compile_pipeline(ctx).await?;
@@ -402,6 +408,67 @@ impl Engine {
         Ok(doc.info())
     }
 
+    /// Compile from pre-parsed raw nodes — skips the parse stage.
+    async fn compile_pre_parsed(
+        &self,
+        nodes: &[vectorless_document::RawNodeInput],
+        name: &str,
+    ) -> Result<vectorless_document::DocumentInfo> {
+        use vectorless_compiler::parse::RawNode;
+        use vectorless_document::{CURRENT_SCHEMA_VERSION, DocumentMeta};
+
+        let raw_nodes: Vec<RawNode> = nodes
+            .iter()
+            .map(|n| RawNode::new(&n.title).with_content(&n.content).with_level(n.level))
+            .collect();
+
+        let compiler_input =
+            vectorless_compiler::CompilerInput::pre_parsed(raw_nodes, name.to_string());
+        let pipeline_options = vectorless_compiler::PipelineOptions::default();
+
+        let mut executor = (self.indexer.executor_factory)();
+        let result = executor.execute(compiler_input, pipeline_options).await?;
+
+        let tree = result
+            .tree
+            .ok_or_else(|| Error::Parse("Document tree not generated".to_string()))?;
+
+        let node_count = tree.node_count();
+        let doc_id = uuid::Uuid::new_v4().to_string();
+
+        let mut meta = DocumentMeta::new();
+        meta.update_processing_stats(node_count, result.metrics.total_tokens_generated, result.metrics.total_time_ms());
+
+        let doc = vectorless_document::Document {
+            schema_version: CURRENT_SCHEMA_VERSION,
+            doc_id: doc_id.clone(),
+            name: name.to_string(),
+            format: "pre-parsed".to_string(),
+            source_path: None,
+            tree,
+            nav_index: result.navigation_index.unwrap_or_default(),
+            reasoning_index: result.reasoning_index.unwrap_or_default(),
+            summary: result.description.unwrap_or_default(),
+            concepts: result.concepts,
+            query_routes: result.query_routes,
+            chain_index: result.chain_index,
+            content_overlap: result.content_overlap,
+            evidence_scores: result.evidence_scores,
+            page_count: result.page_count,
+            meta: Some(meta),
+        };
+
+        self.workspace.save(&doc).await?;
+
+        let loaded = self
+            .workspace
+            .load(&doc_id)
+            .await?
+            .ok_or_else(|| Error::Config("Document not found after compile".into()))?;
+
+        Ok(loaded.info())
+    }
+
     /// Remove a document from the workspace.
     pub async fn forget(&self, doc_id: &str) -> Result<()> {
         self.workspace.remove(doc_id).await?;
@@ -509,8 +576,8 @@ impl Engine {
                 .indexer
                 .detect_format_from_path(path)
                 .unwrap_or(vectorless_compiler::parse::DocumentFormat::Markdown),
-            CompileSource::Content { format, .. } => *format,
-            CompileSource::Bytes { format, .. } => *format,
+            CompileSource::Content { format, .. } => format.clone(),
+            CompileSource::Bytes { format, .. } => format.clone(),
         };
 
         let checkpoint_dir = Some(self.config.storage.checkpoint_dir.clone());
@@ -519,6 +586,9 @@ impl Engine {
             mode: match format {
                 vectorless_compiler::parse::DocumentFormat::Markdown => SourceFormat::Markdown,
                 vectorless_compiler::parse::DocumentFormat::Pdf => SourceFormat::Pdf,
+                vectorless_compiler::parse::DocumentFormat::Custom(ref name) => {
+                    SourceFormat::Custom(name.clone())
+                }
             },
             generate_ids: options.generate_ids,
             summary_strategy: if options.generate_summaries {
diff --git a/crates/vectorless-engine/src/indexer.rs b/crates/vectorless-engine/src/indexer.rs
index fd31574..5854fdd 100644
--- a/crates/vectorless-engine/src/indexer.rs
+++ b/crates/vectorless-engine/src/indexer.rs
@@ -42,7 +42,7 @@ use vectorless_events::{CompileEvent, EventEmitter};
 /// true parallel document compilation without mutex contention.
 pub(crate) struct IndexerClient {
     /// Factory for creating pipeline executors (one per compile operation).
-    executor_factory: Arc<dyn Fn() -> PipelineExecutor + Send + Sync>,
+    pub(crate) executor_factory: Arc<dyn Fn() -> PipelineExecutor + Send + Sync>,
 
     /// Event emitter.
     events: EventEmitter,
@@ -92,11 +92,11 @@ impl IndexerClient {
         match source {
             CompileSource::Path(path) => self.index_from_path(path, name, pipeline_options).await,
             CompileSource::Content { data, format } => {
-                self.index_from_content(data, *format, name, pipeline_options)
+                self.index_from_content(data, format.clone(), name, pipeline_options)
                     .await
             }
             CompileSource::Bytes { data, format } => {
-                self.index_from_bytes(data, *format, name, pipeline_options)
+                self.index_from_bytes(data, format.clone(), name, pipeline_options)
                     .await
             }
         }
@@ -152,7 +152,7 @@ impl IndexerClient {
         pipeline_options: PipelineOptions,
     ) -> Result<Document> {
         // Validate content before compiling
-        let validation = vectorless_utils::validate_content(content, format);
+        let validation = vectorless_utils::validate_content(content, format.clone());
         if !validation.valid {
             return Err(Error::Parse(
                 validation
@@ -184,7 +184,7 @@ impl IndexerClient {
         pipeline_options: PipelineOptions,
     ) -> Result<Document> {
         // Validate bytes before compiling
-        let validation = vectorless_utils::validate_bytes(bytes, format);
+        let validation = vectorless_utils::validate_bytes(bytes, format.clone());
         if !validation.valid {
             return Err(Error::Parse(
                 validation
@@ -230,7 +230,7 @@ impl IndexerClient {
 
         let doc_id = Uuid::new_v4().to_string();
         self.events
-            .emit_compile(CompileEvent::FormatDetected { format });
+            .emit_compile(CompileEvent::FormatDetected { format: format.clone() });
 
         info!("Compiling {:?} document: {}", format, source_label);
 
@@ -325,6 +325,7 @@ impl IndexerClient {
             SourceFormat::Markdown => DocumentFormat::Markdown,
             SourceFormat::Pdf => DocumentFormat::Pdf,
             SourceFormat::Auto => DocumentFormat::Markdown,
+            SourceFormat::Custom(name) => DocumentFormat::Custom(name.clone()),
         }
     }
 
diff --git a/crates/vectorless-engine/src/lib.rs b/crates/vectorless-engine/src/lib.rs
index c0af41e..a89d582 100644
--- a/crates/vectorless-engine/src/lib.rs
+++ b/crates/vectorless-engine/src/lib.rs
@@ -48,7 +48,7 @@ pub use vectorless_document::DocumentFormat;
 
 pub use vectorless_config::Config;
 pub use vectorless_document::DocumentTree;
-pub use vectorless_document::{Concept, DocumentInfo, IngestInput};
+pub use vectorless_document::{Concept, DocumentInfo, IngestInput, RawNodeInput};
 pub use vectorless_error::{Error, Result};
 pub use vectorless_events::{CompileEvent, EventEmitter, WorkspaceEvent};
 pub use vectorless_graph::{
diff --git a/crates/vectorless-py/src/engine.rs b/crates/vectorless-py/src/engine.rs
index 31f64e9..a3471e5 100644
--- a/crates/vectorless-py/src/engine.rs
+++ b/crates/vectorless-py/src/engine.rs
@@ -8,7 +8,7 @@ use pyo3_async_runtimes::tokio::future_into_py;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
-use ::vectorless_engine::{Engine, EngineBuilder, IngestInput};
+use ::vectorless_engine::{Engine, EngineBuilder, IngestInput, RawNodeInput};
 
 use super::document::{PyDocument, PyDocumentInfo};
 use super::error::VectorlessError;
@@ -25,6 +25,20 @@ async fn run_compile(engine: Arc<Engine>, input: IngestInput) -> PyResult<PyDocu
     Ok(PyDocumentInfo { inner: doc })
 }
 
+async fn run_compile_raw(
+    engine: Arc<Engine>,
+    name: String,
+    nodes: Vec<(String, String, usize)>,
+) -> PyResult<PyDocumentInfo> {
+    let raw_nodes: Vec<RawNodeInput> = nodes
+        .into_iter()
+        .map(|(title, content, level)| RawNodeInput { title, content, level })
+        .collect();
+    let input = IngestInput::PreParsed { name, nodes: raw_nodes };
+    let doc = engine.compile(input).await.map_err(to_py_err)?;
+    Ok(PyDocumentInfo { inner: doc })
+}
+
 async fn run_forget(engine: Arc<Engine>, doc_id: String) -> PyResult<()> {
     engine.forget(&doc_id).await.map_err(to_py_err)
 }
@@ -179,6 +193,30 @@ impl PyEngine {
         future_into_py(py, run_compile(engine, input))
     }
 
+    /// Compile from pre-parsed raw nodes — skips the parse stage.
+    ///
+    /// Use this when you have already structured the document into nodes
+    /// (e.g., a Python plugin that parses code files into sections).
+    ///
+    /// Args:
+    ///     name: Document name.
+    ///     raw_nodes: List of (title, content, level) tuples.
+    ///
+    /// Returns:
+    ///     DocumentInfo with doc_id, summary, structure, concepts.
+    ///
+    /// Raises:
+    ///     VectorlessError: If compilation fails.
+    fn compile_raw<'py>(
+        &self,
+        py: Python<'py>,
+        name: String,
+        raw_nodes: Vec<(String, String, usize)>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let engine = Arc::clone(&self.inner);
+        future_into_py(py, run_compile_raw(engine, name, raw_nodes))
+    }
+
     /// Remove a document by ID.
     ///
     /// Args:
diff --git a/vectorless/engine.py b/vectorless/engine.py
index 9b991eb..d7e2b0b 100644
--- a/vectorless/engine.py
+++ b/vectorless/engine.py
@@ -143,6 +143,7 @@ async def compile(
         directory: str | Path | None = None,
         content: str | None = None,
         bytes_data: bytes | None = None,
+        raw_nodes: list[dict[str, Any]] | None = None,
         format: str = "markdown",
         name: str | None = None,
         mode: str = "default",
@@ -151,14 +152,27 @@ async def compile(
         """Compile a document from various sources.
 
         Exactly one source must be provided: path, paths, directory,
-        content, or bytes_data.
+        content, bytes_data, or raw_nodes.
+
+        ``raw_nodes`` accepts a list of dicts with keys ``title``, ``content``,
+        and ``level``.  This skips the parse stage — the pipeline starts from
+        tree building.  Use this when the caller has already structured the
+        document (e.g., a Python plugin that parses code files).
+
+        Example::
+
+            nodes = [
+                {"title": "src/main.py", "content": file_content, "level": 1},
+                {"title": "src/lib.rs", "content": file_content, "level": 1},
+            ]
+            result = await engine.compile(raw_nodes=nodes, name="my-project")
         """
         sources_provided = sum(
-            x is not None for x in [path, paths, directory, content, bytes_data]
+            x is not None for x in [path, paths, directory, content, bytes_data, raw_nodes]
         )
         if sources_provided != 1:
             raise ValueError(
-                "Provide exactly one source: path, paths, directory, content, or bytes_data"
+                "Provide exactly one source: path, paths, directory, content, bytes_data, or raw_nodes"
             )
 
         # For single file, delegate to Rust compile
@@ -222,6 +236,16 @@ async def compile(
                 import os
                 os.unlink(tmp_path)
 
+        if raw_nodes is not None:
+            doc_name = name or "pre-parsed"
+            # Convert dicts to (title, content, level) tuples
+            node_tuples = [
+                (n.get("title", ""), n.get("content", ""), n.get("level", 1))
+                for n in raw_nodes
+            ]
+            doc_info = await self._rust.compile_raw(doc_name, node_tuples)
+            return CompileOutput.from_doc_info(doc_info)
+
         raise ValueError("No source provided")
 
     async def compile_batch(

From 5bd1bad9eda4e665887d173b0573b28c8ec8cfcd Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Sun, 26 Apr 2026 21:57:44 +0800
Subject: [PATCH 2/2] refactor: standardize import order and improve code
 formatting

- Standardize import order across multiple files to maintain consistency
- Format function definitions and method implementations with proper
  indentation and line breaks
- Refactor error handling chains to be more readable
- Apply consistent code style throughout the parsing modules
- Add documentation for vectorless-code AST parsing, getting started,
  and incremental compilation features
---
 .../src/parse/markdown/mod.rs                 |  19 ++-
 crates/vectorless-compiler/src/parse/mod.rs   |  21 ++-
 .../vectorless-compiler/src/parse/pdf/mod.rs  |  10 +-
 .../src/passes/frontend/parse.rs              |   7 +-
 .../src/pipeline/context.rs                   |   9 +-
 crates/vectorless-engine/src/engine.rs        |  12 +-
 crates/vectorless-engine/src/indexer.rs       |   5 +-
 crates/vectorless-py/src/engine.rs            |  11 +-
 docs/docs/vectorless-code/ast-parsing.mdx     | 156 ++++++++++++++++++
 docs/docs/vectorless-code/getting-started.mdx | 111 +++++++++++++
 docs/docs/vectorless-code/incremental.mdx     | 109 ++++++++++++
 docs/sidebars.ts                              |   9 +
 12 files changed, 443 insertions(+), 36 deletions(-)
 create mode 100644 docs/docs/vectorless-code/ast-parsing.mdx
 create mode 100644 docs/docs/vectorless-code/getting-started.mdx
 create mode 100644 docs/docs/vectorless-code/incremental.mdx

diff --git a/crates/vectorless-compiler/src/parse/markdown/mod.rs b/crates/vectorless-compiler/src/parse/markdown/mod.rs
index b49aefa..84d4596 100644
--- a/crates/vectorless-compiler/src/parse/markdown/mod.rs
+++ b/crates/vectorless-compiler/src/parse/markdown/mod.rs
@@ -27,7 +27,7 @@ mod parser;
 
 pub use parser::MarkdownParser;
 
-use crate::parse::{Parser, ParseResult};
+use crate::parse::{ParseResult, Parser};
 use std::path::Path;
 use vectorless_error::Result;
 
@@ -39,15 +39,21 @@ pub struct MarkdownParserAdapter {
 impl MarkdownParserAdapter {
     /// Create a new Markdown parser adapter.
     pub fn new() -> Self {
-        Self { inner: MarkdownParser::new() }
+        Self {
+            inner: MarkdownParser::new(),
+        }
     }
 }
 
 #[async_trait::async_trait]
 impl Parser for MarkdownParserAdapter {
-    fn name(&self) -> &str { "markdown" }
+    fn name(&self) -> &str {
+        "markdown"
+    }
 
-    fn extensions(&self) -> &[&str] { &["md", "markdown"] }
+    fn extensions(&self) -> &[&str] {
+        &["md", "markdown"]
+    }
 
     async fn parse_content(&self, content: &str) -> Result<ParseResult> {
         self.inner.parse(content).await
@@ -58,9 +64,8 @@ impl Parser for MarkdownParserAdapter {
     }
 
     async fn parse_bytes(&self, data: &[u8]) -> Result<ParseResult> {
-        let content = std::str::from_utf8(data).map_err(|e| {
-            vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e))
-        })?;
+        let content = std::str::from_utf8(data)
+            .map_err(|e| vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e)))?;
         self.inner.parse(content).await
     }
 }
diff --git a/crates/vectorless-compiler/src/parse/mod.rs b/crates/vectorless-compiler/src/parse/mod.rs
index 1cfc687..ff53ccd 100644
--- a/crates/vectorless-compiler/src/parse/mod.rs
+++ b/crates/vectorless-compiler/src/parse/mod.rs
@@ -174,9 +174,10 @@ pub async fn parse_content(
         DocumentFormat::Pdf => Err(vectorless_error::Error::Parse(
             "PDF requires bytes, not string content".to_string(),
         )),
-        _ => Err(vectorless_error::Error::Parse(
-            format!("Unsupported format for content parsing: {:?}", format),
-        )),
+        _ => Err(vectorless_error::Error::Parse(format!(
+            "Unsupported format for content parsing: {:?}",
+            format
+        ))),
     }
 }
 
@@ -198,9 +199,10 @@ pub async fn parse_file(
             };
             parser.parse_file(path).await
         }
-        _ => Err(vectorless_error::Error::Parse(
-            format!("Unsupported format for file parsing: {:?}", format),
-        )),
+        _ => Err(vectorless_error::Error::Parse(format!(
+            "Unsupported format for file parsing: {:?}",
+            format
+        ))),
     }
 }
 
@@ -225,9 +227,10 @@ pub async fn parse_bytes(
             };
             parser.parse_bytes_async(bytes, None).await
         }
-        _ => Err(vectorless_error::Error::Parse(
-            format!("Unsupported format for bytes parsing: {:?}", format),
-        )),
+        _ => Err(vectorless_error::Error::Parse(format!(
+            "Unsupported format for bytes parsing: {:?}",
+            format
+        ))),
     }
 }
 
diff --git a/crates/vectorless-compiler/src/parse/pdf/mod.rs b/crates/vectorless-compiler/src/parse/pdf/mod.rs
index 6a25387..45a3647 100644
--- a/crates/vectorless-compiler/src/parse/pdf/mod.rs
+++ b/crates/vectorless-compiler/src/parse/pdf/mod.rs
@@ -28,7 +28,7 @@ mod types;
 pub use parser::PdfParser;
 pub use types::PdfPage;
 
-use crate::parse::{Parser, ParseResult};
+use crate::parse::{ParseResult, Parser};
 use std::path::Path;
 use vectorless_error::Result;
 use vectorless_llm::LlmClient;
@@ -51,9 +51,13 @@ impl PdfParserAdapter {
 
 #[async_trait::async_trait]
 impl Parser for PdfParserAdapter {
-    fn name(&self) -> &str { "pdf" }
+    fn name(&self) -> &str {
+        "pdf"
+    }
 
-    fn extensions(&self) -> &[&str] { &["pdf"] }
+    fn extensions(&self) -> &[&str] {
+        &["pdf"]
+    }
 
     async fn parse_content(&self, _content: &str) -> Result<ParseResult> {
         Err(vectorless_error::Error::Parse(
diff --git a/crates/vectorless-compiler/src/passes/frontend/parse.rs b/crates/vectorless-compiler/src/passes/frontend/parse.rs
index a3f224e..f6bfe1b 100644
--- a/crates/vectorless-compiler/src/passes/frontend/parse.rs
+++ b/crates/vectorless-compiler/src/passes/frontend/parse.rs
@@ -116,10 +116,9 @@ impl CompilePass for ParsePass {
                 "node_count".to_string(),
                 serde_json::json!(ctx.raw_nodes.len()),
             );
-            stage_result.metadata.insert(
-                "source".to_string(),
-                serde_json::json!("pre-parsed"),
-            );
+            stage_result
+                .metadata
+                .insert("source".to_string(), serde_json::json!("pre-parsed"));
             return Ok(stage_result);
         }
 
diff --git a/crates/vectorless-compiler/src/pipeline/context.rs b/crates/vectorless-compiler/src/pipeline/context.rs
index 6ef1c32..48d3d3f 100644
--- a/crates/vectorless-compiler/src/pipeline/context.rs
+++ b/crates/vectorless-compiler/src/pipeline/context.rs
@@ -107,10 +107,7 @@ impl CompilerInput {
     /// Create input from pre-parsed raw nodes.
     ///
     /// Skips ParsePass — the pipeline starts from BuildPass.
-    pub fn pre_parsed(
-        nodes: Vec<crate::parse::RawNode>,
-        name: impl Into<String>,
-    ) -> Self {
+    pub fn pre_parsed(nodes: Vec<crate::parse::RawNode>, name: impl Into<String>) -> Self {
         Self::PreParsed {
             nodes,
             name: name.into(),
@@ -356,9 +353,7 @@ impl CompileContext {
     fn compute_source_hash(input: &CompilerInput) -> String {
         use sha2::{Digest, Sha256};
         let hash = match input {
-            CompilerInput::File(path) => {
-                Sha256::digest(path.to_string_lossy().as_bytes())
-            }
+            CompilerInput::File(path) => Sha256::digest(path.to_string_lossy().as_bytes()),
             CompilerInput::Content { content, .. } => Sha256::digest(content.as_bytes()),
             CompilerInput::Bytes { data, .. } => Sha256::digest(data),
             CompilerInput::PreParsed { nodes, .. } => {
diff --git a/crates/vectorless-engine/src/engine.rs b/crates/vectorless-engine/src/engine.rs
index 16a9901..be60d5f 100644
--- a/crates/vectorless-engine/src/engine.rs
+++ b/crates/vectorless-engine/src/engine.rs
@@ -419,7 +419,11 @@ impl Engine {
 
         let raw_nodes: Vec<RawNode> = nodes
             .iter()
-            .map(|n| RawNode::new(&n.title).with_content(&n.content).with_level(n.level))
+            .map(|n| {
+                RawNode::new(&n.title)
+                    .with_content(&n.content)
+                    .with_level(n.level)
+            })
             .collect();
 
         let compiler_input =
@@ -437,7 +441,11 @@ impl Engine {
         let doc_id = uuid::Uuid::new_v4().to_string();
 
         let mut meta = DocumentMeta::new();
-        meta.update_processing_stats(node_count, result.metrics.total_tokens_generated, result.metrics.total_time_ms());
+        meta.update_processing_stats(
+            node_count,
+            result.metrics.total_tokens_generated,
+            result.metrics.total_time_ms(),
+        );
 
         let doc = vectorless_document::Document {
             schema_version: CURRENT_SCHEMA_VERSION,
diff --git a/crates/vectorless-engine/src/indexer.rs b/crates/vectorless-engine/src/indexer.rs
index 5854fdd..7aec623 100644
--- a/crates/vectorless-engine/src/indexer.rs
+++ b/crates/vectorless-engine/src/indexer.rs
@@ -229,8 +229,9 @@ impl IndexerClient {
         });
 
         let doc_id = Uuid::new_v4().to_string();
-        self.events
-            .emit_compile(CompileEvent::FormatDetected { format: format.clone() });
+        self.events.emit_compile(CompileEvent::FormatDetected {
+            format: format.clone(),
+        });
 
         info!("Compiling {:?} document: {}", format, source_label);
 
diff --git a/crates/vectorless-py/src/engine.rs b/crates/vectorless-py/src/engine.rs
index a3471e5..49ef759 100644
--- a/crates/vectorless-py/src/engine.rs
+++ b/crates/vectorless-py/src/engine.rs
@@ -32,9 +32,16 @@ async fn run_compile_raw(
 ) -> PyResult<PyDocumentInfo> {
     let raw_nodes: Vec<RawNodeInput> = nodes
         .into_iter()
-        .map(|(title, content, level)| RawNodeInput { title, content, level })
+        .map(|(title, content, level)| RawNodeInput {
+            title,
+            content,
+            level,
+        })
         .collect();
-    let input = IngestInput::PreParsed { name, nodes: raw_nodes };
+    let input = IngestInput::PreParsed {
+        name,
+        nodes: raw_nodes,
+    };
     let doc = engine.compile(input).await.map_err(to_py_err)?;
     Ok(PyDocumentInfo { inner: doc })
 }
diff --git a/docs/docs/vectorless-code/ast-parsing.mdx b/docs/docs/vectorless-code/ast-parsing.mdx
new file mode 100644
index 0000000..250c21a
--- /dev/null
+++ b/docs/docs/vectorless-code/ast-parsing.mdx
@@ -0,0 +1,156 @@
+---
+sidebar_position: 2
+---
+
+# AST-Level Code Parsing
+
+vectorless-code uses tree-sitter to parse source code into semantic nodes — functions, classes, methods — instead of treating files as flat text. This produces a structured tree that the vectorless engine can navigate with precision.
+
+## Why AST Parsing Matters
+
+Naive code indexing treats each file as a single block of text. When you ask "how does authentication work", the engine has to scan entire files hoping to find relevant snippets. There's no understanding of what a function is, what a class contains, or how methods relate to their parent class.
+
+AST parsing changes this. The engine receives a tree like:
+
+```
+src/auth.py
+├── class_definition: AuthService
+│   ├── function_definition: __init__
+│   ├── function_definition: login
+│   └── function_definition: verify_token
+└── function_definition: create_session
+```
+
+Now the Orchestrator can `cd` into `AuthService`, `ls` to see its methods, and `cat login` to read the authentication logic. This is the same navigation model that works for documents — applied to code with structural precision.
+
+## How It Works
+
+### Per-Language Node Types
+
+Each language defines which AST node types represent semantic units worth indexing:
+
+```python
+SPLITTABLE_NODE_TYPES = {
+    "python": {
+        "function_definition",
+        "class_definition",
+        "decorated_definition",
+        "async_function_definition",
+    },
+    "rust": {
+        "function_item",
+        "impl_item",
+        "struct_item",
+        "enum_item",
+        "trait_item",
+        "mod_item",
+    },
+    # ... 12 languages total
+}
+```
+
+tree-sitter parses the source into an AST, then vectorless-code walks the tree extracting nodes whose type matches this set. Each extracted node becomes a `CodeNode` with:
+
+- `name` — the symbol name (e.g. `AuthService`, `login`)
+- `node_type` — the AST node type (e.g. `class_definition`)
+- `content` — the full source code of the node
+- `children` — nested definitions (methods inside classes)
+
+### Nested Extraction
+
+When a class is extracted, its methods are extracted as children — not as separate top-level nodes. This preserves the parent-child relationship:
+
+```python
+# Input: Python source
+class AuthService:
+    def login(self, username, password):
+        token = self._create_token(username)
+        return token
+
+    def verify_token(self, token):
+        return self._decode(token)
+
+# Output: CodeNode tree
+CodeNode(
+    name="AuthService",
+    node_type="class_definition",
+    children=[
+        CodeNode(name="login", node_type="function_definition", ...),
+        CodeNode(name="verify_token", node_type="function_definition", ...),
+    ],
+)
+```
+
+This nesting produces the raw_node tree that vectorless builds into a navigable Document. Level 1 = file, Level 2 = top-level definitions, Level 3 = nested definitions.
+
+### Name Extraction
+
+The parser extracts human-readable names from AST nodes by finding identifier children:
+
+- `function_definition` → looks for `identifier` child → `"login"`
+- `class_definition` → looks for `identifier` child → `"AuthService"`
+- `decorated_definition` → recurses into the decorated node
+- `impl_item` → looks for `type_identifier` → `"impl UserService"`
+
+## Fallback Strategy
+
+When tree-sitter is unavailable (unsupported language, grammar not installed, parse error), vectorless-code falls back to line-based splitting, breaking the file into blocks at blank-line boundaries. This produces flat `block` nodes without nesting, but still provides functional indexing.
+
+The fallback is transparent. The same `parse_file()` function handles both paths:
+
+```python
+def parse_file(file_path, content, language):
+    parser = _get_parser(language)  # cached per language
+    if parser is None:
+        return fallback_split(content, file_path, language)
+
+    nodes = ast_extract(parser, content, language)
+    if not nodes:
+        return fallback_split(content, file_path, language)
+    return nodes
+```
+
+## Performance Considerations
+
+### Parser Caching
+
+tree-sitter `Parser` instances are cached per language. A 10,000-file Python project creates exactly one Python parser, reused for every `.py` file. This avoids repeated memory allocation and grammar loading.
+
+### Single-Pass File Scan
+
+Files are read exactly once. A single pass computes:
+
+1. File hash (SHA-256 for incremental detection)
+2. Stats (line count, byte size, language distribution)
+3. Content for parsing
+
+### Incremental Parsing
+
+On subsequent compiles, only files whose hash changed are re-parsed. Unchanged files reuse cached raw_nodes directly. See [Incremental Compilation](./incremental.mdx).
+
+## Adding a New Language
+
+To add support for a new language:
+
+1. Add the language to `SPLITTABLE_NODE_TYPES` with the relevant AST node types
+2. Add the tree-sitter grammar package to `pyproject.toml` dependencies
+3. Add the package mapping to `_LANG_PACKAGE_MAP`
+
+For example, to add Zig:
+
+```python
+# ast_parser.py
+SPLITTABLE_NODE_TYPES["zig"] = {
+    "FunctionDecl",
+    "TopLevelDecl",
+}
+
+_LANG_PACKAGE_MAP["zig"] = "tree_sitter_zig"
+```
+
+```toml
+# pyproject.toml
+"tree-sitter-zig>=0.21",
+```
+
+No other code changes are needed. The parser, cache, fallback, and incremental systems handle it automatically.
diff --git a/docs/docs/vectorless-code/getting-started.mdx b/docs/docs/vectorless-code/getting-started.mdx
new file mode 100644
index 0000000..53a2b90
--- /dev/null
+++ b/docs/docs/vectorless-code/getting-started.mdx
@@ -0,0 +1,111 @@
+---
+sidebar_position: 1
+---
+
+# Getting Started
+
+**vectorless-code** is a code-aware search engine built on vectorless. It indexes your codebase using tree-sitter AST parsing and answers questions via LLM reasoning — no embeddings, no vector database.
+
+## Install
+
+```bash
+pip install vectorless-code
+```
+
+## Quick Start
+
+```bash
+# Initialize in your project directory
+cd your-project
+vcc init
+```
+
+This creates `.vectorless_code/settings.yml` with default include/exclude patterns.
+
+```bash
+# Compile the codebase
+vcc compile
+```
+
+This discovers code files, parses them with tree-sitter into semantic nodes (functions, classes, methods), and feeds them to the vectorless engine.
+
+```bash
+# Ask a question
+vcc ask "where is the authentication logic"
+```
+
+## How It Works
+
+```
+vcc compile
+  │
+  ├─ File Discovery (gitignore-aware)
+  │
+  ├─ AST Parsing (tree-sitter)
+  │   ├─ Per-language SPLITTABLE_NODE_TYPES
+  │   └─ Fallback: line-based splitting
+  │
+  ├─ Incremental (SHA-256 per-file hashing)
+  │   ├─ Changed/new → re-parse
+  │   └─ Unchanged → reuse cached raw_nodes
+  │
+  └─ Engine.compile(raw_nodes=...)
+       └─ BuildPass → EnrichPass → ReasoningPass → NavigationPass
+```
+
+## Configuration
+
+### Project Settings
+
+`.vectorless_code/settings.yml`:
+
+```yaml
+# File patterns to include
+include_patterns:
+  - "**/*.py"
+  - "**/*.rs"
+  - "**/*.ts"
+
+# File patterns to exclude
+exclude_patterns:
+  - "**/.*"
+  - "**/node_modules"
+  - "**/target"
+```
+
+### API Key
+
+```bash
+# Option 1: Environment variable
+export VECTORLESS_API_KEY="sk-..."
+
+# Option 2: Prompted during vcc init
+vcc init
+```
+
+### Supported Languages
+
+| Language | Extensions | AST Parsing |
+|----------|-----------|-------------|
+| Python | `.py`, `.pyi` | Yes |
+| Rust | `.rs` | Yes |
+| Go | `.go` | Yes |
+| JavaScript | `.js`, `.jsx`, `.mjs` | Yes |
+| TypeScript | `.ts`, `.tsx` | Yes |
+| Java | `.java` | Yes |
+| C | `.c`, `.h` | Yes |
+| C++ | `.cpp`, `.hpp`, `.cc` | Yes |
+| Ruby | `.rb` | Yes |
+| Swift | `.swift` | Yes |
+| Kotlin | `.kt` | Yes |
+| Scala | `.scala` | Yes |
+| Others | `.sql`, `.sh`, `.lua`, etc. | Fallback (line-based) |
+
+## CLI Reference
+
+| Command | Description |
+|---------|-------------|
+| `vcc init` | Initialize project settings |
+| `vcc compile` | Compile codebase into searchable index |
+| `vcc ask <question>` | Ask a question about the codebase |
+| `vcc status` | Show compilation status and statistics |
diff --git a/docs/docs/vectorless-code/incremental.mdx b/docs/docs/vectorless-code/incremental.mdx
new file mode 100644
index 0000000..3184afd
--- /dev/null
+++ b/docs/docs/vectorless-code/incremental.mdx
@@ -0,0 +1,109 @@
+---
+sidebar_position: 3
+---
+
+# Incremental Compilation
+
+For large codebases, re-parsing every file on each compile is wasteful. vectorless-code uses per-file SHA-256 hashing and a two-tier cache so that only changed files are re-parsed; unchanged files are still read and hashed, but their cached raw_nodes are reused.
+
+## The Problem
+
+A mid-size project might have 5,000 source files. Full AST parsing at ~10ms per file takes ~50 seconds. But in a typical edit session, maybe 5 files changed. Re-parsing all 5,000 files to index 5 changes is 1,000x more work than necessary.
+
+## Solution: Hash + Cache
+
+```
+First compile:
+  scan all files → hash → parse all → build raw_nodes → compile
+
+Subsequent compiles:
+  scan all files → hash → detect changes
+    ├─ 5 changed files → parse → build raw_nodes (fresh)
+    └─ 4,995 unchanged files → reuse cached raw_nodes
+  merge → compile
+```
+
+### Hash Computation
+
+Each file's content is SHA-256 hashed during the scan pass. The hash is deterministic — same content always produces the same hash, regardless of file metadata.
+
+```python
+current_hashes[rel] = hashlib.sha256(content.encode("utf-8")).hexdigest()
+```
+
+### Change Detection
+
+Comparing current hashes against the previous compile's hashes produces three sets:
+
+- **Changed or new** — hash differs or file didn't exist before → needs parsing
+- **Unchanged** — hash matches → reuse cached raw_nodes
+- **Removed** — file existed before but not now → exclude from output
+
+### Two-Tier Cache
+
+The cache stores two files in `.vectorless_code/cache/`:
+
+| File | Content | Purpose |
+|------|---------|---------|
+| `hashes.json` | `{rel_path: sha256_hex}` | Change detection |
+| `parsed_nodes.json` | `{rel_path: [raw_nodes]}` | Skip re-parsing unchanged files |
+
+On incremental compile:
+
+1. Load `hashes.json` and `parsed_nodes.json` from previous run
+2. Scan all files, compute current hashes
+3. For changed files: parse with AST, build fresh raw_nodes
+4. For unchanged files: load raw_nodes from `parsed_nodes.json`
+5. Merge, sort by path, compile
+
+## Performance Impact
+
+| Scenario | Files | Changed | Parsing Time |
+|----------|-------|---------|-------------|
+| First compile | 5,000 | 5,000 | ~50s |
+| Incremental (5 changes) | 5,000 | 5 | ~0.05s |
+| Incremental (0 changes) | 5,000 | 0 | ~0s |
+
+The scan pass (read + hash) still touches every file, but this is I/O-bound and fast compared to AST parsing. The expensive work — tree-sitter parsing — only runs on changed files.
+
+## Cache Consistency
+
+### When is cache invalidated?
+
+- **File content changes** — hash mismatch triggers re-parse
+- **File removed** — excluded from merged output
+- **New file added** — no previous hash, treated as changed
+
+### What about parser upgrades?
+
+If `SPLITTABLE_NODE_TYPES` changes (e.g., adding a new node type to extract), the cache is not invalidated, because invalidation is keyed on file content only: unchanged files keep reusing raw_nodes built with the old configuration. To force a full re-parse:
+
+```bash
+rm -rf .vectorless_code/cache/
+vcc compile
+```
+
+### When does cache get saved?
+
+Cache is written after a successful compile. If the compile fails (e.g., API error), the cache is not updated — the next compile will retry with the same cache state.
+
+## Implementation Detail
+
+The scan-then-parse separation is intentional:
+
+```python
+# Step 1: Scan (cheap, I/O-bound)
+current_hashes, stats, content_map = _scan_files(files, root)
+
+# Step 2: Parse only changed files (expensive, CPU-bound)
+changed = [p for p, h in current_hashes.items() if prev_hashes.get(p) != h]
+for rel in changed:
+    nodes = parse_file(rel, content_map[rel], lang)
+    fresh_raw[rel] = build_raw_nodes([(rel, lang, nodes)])
+
+# Step 3: Merge with cached
+for rel in unchanged:
+    merged_raw[rel] = cached_raw[rel]
+```
+
+`content_map` holds file contents in memory from the scan pass until parsing completes, and is then released (`del content_map`). For large codebases, this means peak memory is roughly the sum of all file contents — acceptable because file contents are strings that Python can manage efficiently, and they're freed before the compile step, which has its own memory profile.
diff --git a/docs/sidebars.ts b/docs/sidebars.ts
index c7ee832..678e0a0 100644
--- a/docs/sidebars.ts
+++ b/docs/sidebars.ts
@@ -11,6 +11,15 @@ const sidebars: SidebarsConfig = {
       ],
     },
     'architecture',
+    {
+      type: 'category',
+      label: 'vectorless-code',
+      items: [
+        'vectorless-code/getting-started',
+        'vectorless-code/ast-parsing',
+        'vectorless-code/incremental',
+      ],
+    },
     {
       type: 'category',
       label: 'Vectorless Compiler',