refactor: standardize import order and improve code formatting

zTgx · zTgx · commit 5bd1bad9eda4 · 2026-04-26T21:57:44.000+08:00
- Standardize import order across multiple files to maintain consistency
- Format function definitions and method implementations with proper
  indentation and line breaks
- Refactor error handling chains to be more readable
- Apply consistent code style throughout the parsing modules
- Add documentation for vectorless-code AST parsing, getting started,
  and incremental compilation features
diff --git a/crates/vectorless-compiler/src/parse/markdown/mod.rs b/crates/vectorless-compiler/src/parse/markdown/mod.rs
@@ -27,7 +27,7 @@ mod parser;
 
 pub use parser::MarkdownParser;
 
-use crate::parse::{Parser, ParseResult};
+use crate::parse::{ParseResult, Parser};
 use std::path::Path;
 use vectorless_error::Result;
 
@@ -39,15 +39,21 @@ pub struct MarkdownParserAdapter {
 impl MarkdownParserAdapter {
     /// Create a new Markdown parser adapter.
     pub fn new() -> Self {
-        Self { inner: MarkdownParser::new() }
+        Self {
+            inner: MarkdownParser::new(),
+        }
     }
 }
 
 #[async_trait::async_trait]
 impl Parser for MarkdownParserAdapter {
-    fn name(&self) -> &str { "markdown" }
+    fn name(&self) -> &str {
+        "markdown"
+    }
 
-    fn extensions(&self) -> &[&str] { &["md", "markdown"] }
+    fn extensions(&self) -> &[&str] {
+        &["md", "markdown"]
+    }
 
     async fn parse_content(&self, content: &str) -> Result<ParseResult> {
         self.inner.parse(content).await
@@ -58,9 +64,8 @@ impl Parser for MarkdownParserAdapter {
     }
 
     async fn parse_bytes(&self, data: &[u8]) -> Result<ParseResult> {
-        let content = std::str::from_utf8(data).map_err(|e| {
-            vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e))
-        })?;
+        let content = std::str::from_utf8(data)
+            .map_err(|e| vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e)))?;
         self.inner.parse(content).await
     }
 }
diff --git a/crates/vectorless-compiler/src/parse/mod.rs b/crates/vectorless-compiler/src/parse/mod.rs
@@ -174,9 +174,10 @@ pub async fn parse_content(
         DocumentFormat::Pdf => Err(vectorless_error::Error::Parse(
             "PDF requires bytes, not string content".to_string(),
         )),
-        _ => Err(vectorless_error::Error::Parse(
-            format!("Unsupported format for content parsing: {:?}", format),
-        )),
+        _ => Err(vectorless_error::Error::Parse(format!(
+            "Unsupported format for content parsing: {:?}",
+            format
+        ))),
     }
 }
 
@@ -198,9 +199,10 @@ pub async fn parse_file(
             };
             parser.parse_file(path).await
         }
-        _ => Err(vectorless_error::Error::Parse(
-            format!("Unsupported format for file parsing: {:?}", format),
-        )),
+        _ => Err(vectorless_error::Error::Parse(format!(
+            "Unsupported format for file parsing: {:?}",
+            format
+        ))),
     }
 }
 
@@ -225,9 +227,10 @@ pub async fn parse_bytes(
             };
             parser.parse_bytes_async(bytes, None).await
         }
-        _ => Err(vectorless_error::Error::Parse(
-            format!("Unsupported format for bytes parsing: {:?}", format),
-        )),
+        _ => Err(vectorless_error::Error::Parse(format!(
+            "Unsupported format for bytes parsing: {:?}",
+            format
+        ))),
     }
 }
 
diff --git a/crates/vectorless-compiler/src/parse/pdf/mod.rs b/crates/vectorless-compiler/src/parse/pdf/mod.rs
@@ -28,7 +28,7 @@ mod types;
 pub use parser::PdfParser;
 pub use types::PdfPage;
 
-use crate::parse::{Parser, ParseResult};
+use crate::parse::{ParseResult, Parser};
 use std::path::Path;
 use vectorless_error::Result;
 use vectorless_llm::LlmClient;
@@ -51,9 +51,13 @@ impl PdfParserAdapter {
 
 #[async_trait::async_trait]
 impl Parser for PdfParserAdapter {
-    fn name(&self) -> &str { "pdf" }
+    fn name(&self) -> &str {
+        "pdf"
+    }
 
-    fn extensions(&self) -> &[&str] { &["pdf"] }
+    fn extensions(&self) -> &[&str] {
+        &["pdf"]
+    }
 
     async fn parse_content(&self, _content: &str) -> Result<ParseResult> {
         Err(vectorless_error::Error::Parse(
diff --git a/crates/vectorless-compiler/src/passes/frontend/parse.rs b/crates/vectorless-compiler/src/passes/frontend/parse.rs
@@ -116,10 +116,9 @@ impl CompilePass for ParsePass {
                 "node_count".to_string(),
                 serde_json::json!(ctx.raw_nodes.len()),
             );
-            stage_result.metadata.insert(
-                "source".to_string(),
-                serde_json::json!("pre-parsed"),
-            );
+            stage_result
+                .metadata
+                .insert("source".to_string(), serde_json::json!("pre-parsed"));
             return Ok(stage_result);
         }
 
diff --git a/crates/vectorless-compiler/src/pipeline/context.rs b/crates/vectorless-compiler/src/pipeline/context.rs
@@ -107,10 +107,7 @@ impl CompilerInput {
     /// Create input from pre-parsed raw nodes.
     ///
     /// Skips ParsePass — the pipeline starts from BuildPass.
-    pub fn pre_parsed(
-        nodes: Vec<crate::parse::RawNode>,
-        name: impl Into<String>,
-    ) -> Self {
+    pub fn pre_parsed(nodes: Vec<crate::parse::RawNode>, name: impl Into<String>) -> Self {
         Self::PreParsed {
             nodes,
             name: name.into(),
@@ -356,9 +353,7 @@ impl CompileContext {
     fn compute_source_hash(input: &CompilerInput) -> String {
         use sha2::{Digest, Sha256};
         let hash = match input {
-            CompilerInput::File(path) => {
-                Sha256::digest(path.to_string_lossy().as_bytes())
-            }
+            CompilerInput::File(path) => Sha256::digest(path.to_string_lossy().as_bytes()),
             CompilerInput::Content { content, .. } => Sha256::digest(content.as_bytes()),
             CompilerInput::Bytes { data, .. } => Sha256::digest(data),
             CompilerInput::PreParsed { nodes, .. } => {
diff --git a/crates/vectorless-engine/src/engine.rs b/crates/vectorless-engine/src/engine.rs
@@ -419,7 +419,11 @@ impl Engine {
 
         let raw_nodes: Vec<RawNode> = nodes
             .iter()
-            .map(|n| RawNode::new(&n.title).with_content(&n.content).with_level(n.level))
+            .map(|n| {
+                RawNode::new(&n.title)
+                    .with_content(&n.content)
+                    .with_level(n.level)
+            })
             .collect();
 
         let compiler_input =
@@ -437,7 +441,11 @@ impl Engine {
         let doc_id = uuid::Uuid::new_v4().to_string();
 
         let mut meta = DocumentMeta::new();
-        meta.update_processing_stats(node_count, result.metrics.total_tokens_generated, result.metrics.total_time_ms());
+        meta.update_processing_stats(
+            node_count,
+            result.metrics.total_tokens_generated,
+            result.metrics.total_time_ms(),
+        );
 
         let doc = vectorless_document::Document {
             schema_version: CURRENT_SCHEMA_VERSION,
diff --git a/crates/vectorless-engine/src/indexer.rs b/crates/vectorless-engine/src/indexer.rs
@@ -229,8 +229,9 @@ impl IndexerClient {
         });
 
         let doc_id = Uuid::new_v4().to_string();
-        self.events
-            .emit_compile(CompileEvent::FormatDetected { format: format.clone() });
+        self.events.emit_compile(CompileEvent::FormatDetected {
+            format: format.clone(),
+        });
 
         info!("Compiling {:?} document: {}", format, source_label);
 
diff --git a/crates/vectorless-py/src/engine.rs b/crates/vectorless-py/src/engine.rs
@@ -32,9 +32,16 @@ async fn run_compile_raw(
 ) -> PyResult<PyDocumentInfo> {
     let raw_nodes: Vec<RawNodeInput> = nodes
         .into_iter()
-        .map(|(title, content, level)| RawNodeInput { title, content, level })
+        .map(|(title, content, level)| RawNodeInput {
+            title,
+            content,
+            level,
+        })
         .collect();
-    let input = IngestInput::PreParsed { name, nodes: raw_nodes };
+    let input = IngestInput::PreParsed {
+        name,
+        nodes: raw_nodes,
+    };
     let doc = engine.compile(input).await.map_err(to_py_err)?;
     Ok(PyDocumentInfo { inner: doc })
 }
diff --git a/docs/docs/vectorless-code/ast-parsing.mdx b/docs/docs/vectorless-code/ast-parsing.mdx
@@ -0,0 +1,156 @@
+---
+sidebar_position: 2
+---
+
+# AST-Level Code Parsing
+
+vectorless-code uses tree-sitter to parse source code into semantic nodes — functions, classes, methods — instead of treating files as flat text. This produces a structured tree that the vectorless engine can navigate with precision.
+
+## Why AST Parsing Matters
+
+Naive code indexing treats each file as a single block of text. When you ask "how does authentication work", the engine has to scan entire files hoping to find relevant snippets. There's no understanding of what a function is, what a class contains, or how methods relate to their parent class.
+
+AST parsing changes this. The engine receives a tree like:
+
+```
+src/auth.py
+├── class_definition: AuthService
+│   ├── function_definition: __init__
+│   ├── function_definition: login
+│   └── function_definition: verify_token
+└── function_definition: create_session
+```
+
+Now the Orchestrator can `cd` into `AuthService`, `ls` to see its methods, and `cat login` to read the authentication logic. This is the same navigation model that works for documents — applied to code with structural precision.
+
+## How It Works
+
+### Per-Language Node Types
+
+Each language defines which AST node types represent semantic units worth indexing:
+
+```python
+SPLITTABLE_NODE_TYPES = {
+    "python": {
+        "function_definition",
+        "class_definition",
+        "decorated_definition",
+        "async_function_definition",
+    },
+    "rust": {
+        "function_item",
+        "impl_item",
+        "struct_item",
+        "enum_item",
+        "trait_item",
+        "mod_item",
+    },
+    # ... 12 languages total
+}
+```
+
+tree-sitter parses the source into an AST; vectorless-code then walks that tree and extracts every node whose type is in the language's set. Each extracted node becomes a `CodeNode` with:
+
+- `name` — the symbol name (e.g. `AuthService`, `login`)
+- `node_type` — the AST node type (e.g. `class_definition`)
+- `content` — the full source code of the node
+- `children` — nested definitions (methods inside classes)
+
+### Nested Extraction
+
+When a class is extracted, its methods are extracted as children — not as separate top-level nodes. This preserves the parent-child relationship:
+
+```python
+# Input: Python source
+class AuthService:
+    def login(self, username, password):
+        token = self._create_token(username)
+        return token
+
+    def verify_token(self, token):
+        return self._decode(token)
+
+# Output: CodeNode tree
+CodeNode(
+    name="AuthService",
+    node_type="class_definition",
+    children=[
+        CodeNode(name="login", node_type="function_definition", ...),
+        CodeNode(name="verify_token", node_type="function_definition", ...),
+    ],
+)
+```
+
+This nesting produces the `raw_node` tree that vectorless builds into a navigable Document. Levels follow depth: level 1 is the file, level 2 is a top-level definition, and level 3 is a nested definition.
+
+### Name Extraction
+
+The parser extracts human-readable names from AST nodes by finding identifier children:
+
+- `function_definition` → looks for `identifier` child → `"login"`
+- `class_definition` → looks for `identifier` child → `"AuthService"`
+- `decorated_definition` → recurses into the decorated node
+- `impl_item` → looks for `type_identifier` → `"impl UserService"`
+
+## Fallback Strategy
+
+When tree-sitter cannot be used (unsupported language, grammar not installed) or extraction yields no semantic nodes (for example, after a parse error), vectorless-code falls back to line-based splitting: the file is split into blocks at blank-line boundaries. This produces flat `block` nodes without nesting, but still provides functional indexing.
+
+The fallback is transparent. The same `parse_file()` function handles both paths:
+
+```python
+def parse_file(file_path, content, language):
+    parser = _get_parser(language)  # cached per language
+    if parser is None:
+        return fallback_split(content, file_path, language)
+
+    nodes = ast_extract(parser, content, language)
+    if not nodes:
+        return fallback_split(content, file_path, language)
+    return nodes
+```
+
+## Performance Considerations
+
+### Parser Caching
+
+tree-sitter `Parser` instances are cached per language. A 10,000-file Python project creates exactly one Python parser, reused for every `.py` file. This avoids repeated memory allocation and grammar loading.
+
+### Single-Pass File Scan
+
+Files are read exactly once. A single pass computes:
+
+1. File hash (SHA-256 for incremental detection)
+2. Stats (line count, byte size, language distribution)
+3. Content for parsing
+
+### Incremental Parsing
+
+On subsequent compiles, only files whose hash has changed are re-parsed. Unchanged files reuse their cached `raw_nodes` directly. See [Incremental Compilation](./incremental.mdx).
+
+## Adding a New Language
+
+To add support for a new language:
+
+1. Add the language to `SPLITTABLE_NODE_TYPES` with the relevant AST node types
+2. Add the tree-sitter grammar package to `pyproject.toml` dependencies
+3. Add the package mapping to `_LANG_PACKAGE_MAP`
+
+For example, to add Zig:
+
+```python
+# ast_parser.py
+SPLITTABLE_NODE_TYPES["zig"] = {
+    "FunctionDecl",
+    "TopLevelDecl",
+}
+
+_LANG_PACKAGE_MAP["zig"] = "tree_sitter_zig"
+```
+
+```toml
+# pyproject.toml
+"tree-sitter-zig>=0.21",
+```
+
+No other code changes are needed. The parser, cache, fallback, and incremental systems handle the new language automatically.
diff --git a/docs/docs/vectorless-code/getting-started.mdx b/docs/docs/vectorless-code/getting-started.mdx
diff --git a/docs/docs/vectorless-code/incremental.mdx b/docs/docs/vectorless-code/incremental.mdx
diff --git a/docs/sidebars.ts b/docs/sidebars.ts

Original file line number	Diff line number	Diff line change
`@@ -174,9 +174,10 @@ pub async fn parse_content(`
`174`	`174`	`DocumentFormat::Pdf => Err(vectorless_error::Error::Parse(`
`175`	`175`	`"PDF requires bytes, not string content".to_string(),`
`176`	`176`	`)),`
`177`		`- _ => Err(vectorless_error::Error::Parse(`
`178`		`- format!("Unsupported format for content parsing: {:?}", format),`
`179`		`- )),`
	`177`	`+ _ => Err(vectorless_error::Error::Parse(format!(`
	`178`	`+ "Unsupported format for content parsing: {:?}",`
	`179`	`+ format`
	`180`	`+ ))),`
`180`	`181`	`}`
`181`	`182`	`}`
`182`	`183`
`@@ -198,9 +199,10 @@ pub async fn parse_file(`
`198`	`199`	`};`
`199`	`200`	`parser.parse_file(path).await`
`200`	`201`	`}`
`201`		`- _ => Err(vectorless_error::Error::Parse(`
`202`		`- format!("Unsupported format for file parsing: {:?}", format),`
`203`		`- )),`
	`202`	`+ _ => Err(vectorless_error::Error::Parse(format!(`
	`203`	`+ "Unsupported format for file parsing: {:?}",`
	`204`	`+ format`
	`205`	`+ ))),`
`204`	`206`	`}`
`205`	`207`	`}`
`206`	`208`
`@@ -225,9 +227,10 @@ pub async fn parse_bytes(`
`225`	`227`	`};`
`226`	`228`	`parser.parse_bytes_async(bytes, None).await`
`227`	`229`	`}`
`228`		`- _ => Err(vectorless_error::Error::Parse(`
`229`		`- format!("Unsupported format for bytes parsing: {:?}", format),`
`230`		`- )),`
	`230`	`+ _ => Err(vectorless_error::Error::Parse(format!(`
	`231`	`+ "Unsupported format for bytes parsing: {:?}",`
	`232`	`+ format`
	`233`	`+ ))),`
`231`	`234`	`}`
`232`	`235`	`}`
`233`	`236`