Skip to content

Commit 5bd1bad

Browse files
committed
refactor: standardize import order and improve code formatting
- Standardize import order across multiple files to maintain consistency
- Format function definitions and method implementations with proper indentation and line breaks
- Refactor error handling chains to be more readable
- Apply consistent code style throughout the parsing modules
- Add documentation for vectorless-code AST parsing, getting started, and incremental compilation features
1 parent 80ce5a6 commit 5bd1bad

12 files changed

Lines changed: 443 additions & 36 deletions

File tree

crates/vectorless-compiler/src/parse/markdown/mod.rs

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ mod parser;
2727

2828
pub use parser::MarkdownParser;
2929

30-
use crate::parse::{Parser, ParseResult};
30+
use crate::parse::{ParseResult, Parser};
3131
use std::path::Path;
3232
use vectorless_error::Result;
3333

@@ -39,15 +39,21 @@ pub struct MarkdownParserAdapter {
3939
impl MarkdownParserAdapter {
4040
/// Create a new Markdown parser adapter.
4141
pub fn new() -> Self {
42-
Self { inner: MarkdownParser::new() }
42+
Self {
43+
inner: MarkdownParser::new(),
44+
}
4345
}
4446
}
4547

4648
#[async_trait::async_trait]
4749
impl Parser for MarkdownParserAdapter {
48-
fn name(&self) -> &str { "markdown" }
50+
fn name(&self) -> &str {
51+
"markdown"
52+
}
4953

50-
fn extensions(&self) -> &[&str] { &["md", "markdown"] }
54+
fn extensions(&self) -> &[&str] {
55+
&["md", "markdown"]
56+
}
5157

5258
async fn parse_content(&self, content: &str) -> Result<ParseResult> {
5359
self.inner.parse(content).await
@@ -58,9 +64,8 @@ impl Parser for MarkdownParserAdapter {
5864
}
5965

6066
async fn parse_bytes(&self, data: &[u8]) -> Result<ParseResult> {
61-
let content = std::str::from_utf8(data).map_err(|e| {
62-
vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e))
63-
})?;
67+
let content = std::str::from_utf8(data)
68+
.map_err(|e| vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e)))?;
6469
self.inner.parse(content).await
6570
}
6671
}

crates/vectorless-compiler/src/parse/mod.rs

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,10 @@ pub async fn parse_content(
174174
DocumentFormat::Pdf => Err(vectorless_error::Error::Parse(
175175
"PDF requires bytes, not string content".to_string(),
176176
)),
177-
_ => Err(vectorless_error::Error::Parse(
178-
format!("Unsupported format for content parsing: {:?}", format),
179-
)),
177+
_ => Err(vectorless_error::Error::Parse(format!(
178+
"Unsupported format for content parsing: {:?}",
179+
format
180+
))),
180181
}
181182
}
182183

@@ -198,9 +199,10 @@ pub async fn parse_file(
198199
};
199200
parser.parse_file(path).await
200201
}
201-
_ => Err(vectorless_error::Error::Parse(
202-
format!("Unsupported format for file parsing: {:?}", format),
203-
)),
202+
_ => Err(vectorless_error::Error::Parse(format!(
203+
"Unsupported format for file parsing: {:?}",
204+
format
205+
))),
204206
}
205207
}
206208

@@ -225,9 +227,10 @@ pub async fn parse_bytes(
225227
};
226228
parser.parse_bytes_async(bytes, None).await
227229
}
228-
_ => Err(vectorless_error::Error::Parse(
229-
format!("Unsupported format for bytes parsing: {:?}", format),
230-
)),
230+
_ => Err(vectorless_error::Error::Parse(format!(
231+
"Unsupported format for bytes parsing: {:?}",
232+
format
233+
))),
231234
}
232235
}
233236

crates/vectorless-compiler/src/parse/pdf/mod.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ mod types;
2828
pub use parser::PdfParser;
2929
pub use types::PdfPage;
3030

31-
use crate::parse::{Parser, ParseResult};
31+
use crate::parse::{ParseResult, Parser};
3232
use std::path::Path;
3333
use vectorless_error::Result;
3434
use vectorless_llm::LlmClient;
@@ -51,9 +51,13 @@ impl PdfParserAdapter {
5151

5252
#[async_trait::async_trait]
5353
impl Parser for PdfParserAdapter {
54-
fn name(&self) -> &str { "pdf" }
54+
fn name(&self) -> &str {
55+
"pdf"
56+
}
5557

56-
fn extensions(&self) -> &[&str] { &["pdf"] }
58+
fn extensions(&self) -> &[&str] {
59+
&["pdf"]
60+
}
5761

5862
async fn parse_content(&self, _content: &str) -> Result<ParseResult> {
5963
Err(vectorless_error::Error::Parse(

crates/vectorless-compiler/src/passes/frontend/parse.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,10 +116,9 @@ impl CompilePass for ParsePass {
116116
"node_count".to_string(),
117117
serde_json::json!(ctx.raw_nodes.len()),
118118
);
119-
stage_result.metadata.insert(
120-
"source".to_string(),
121-
serde_json::json!("pre-parsed"),
122-
);
119+
stage_result
120+
.metadata
121+
.insert("source".to_string(), serde_json::json!("pre-parsed"));
123122
return Ok(stage_result);
124123
}
125124

crates/vectorless-compiler/src/pipeline/context.rs

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,7 @@ impl CompilerInput {
107107
/// Create input from pre-parsed raw nodes.
108108
///
109109
/// Skips ParsePass — the pipeline starts from BuildPass.
110-
pub fn pre_parsed(
111-
nodes: Vec<crate::parse::RawNode>,
112-
name: impl Into<String>,
113-
) -> Self {
110+
pub fn pre_parsed(nodes: Vec<crate::parse::RawNode>, name: impl Into<String>) -> Self {
114111
Self::PreParsed {
115112
nodes,
116113
name: name.into(),
@@ -356,9 +353,7 @@ impl CompileContext {
356353
fn compute_source_hash(input: &CompilerInput) -> String {
357354
use sha2::{Digest, Sha256};
358355
let hash = match input {
359-
CompilerInput::File(path) => {
360-
Sha256::digest(path.to_string_lossy().as_bytes())
361-
}
356+
CompilerInput::File(path) => Sha256::digest(path.to_string_lossy().as_bytes()),
362357
CompilerInput::Content { content, .. } => Sha256::digest(content.as_bytes()),
363358
CompilerInput::Bytes { data, .. } => Sha256::digest(data),
364359
CompilerInput::PreParsed { nodes, .. } => {

crates/vectorless-engine/src/engine.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,11 @@ impl Engine {
419419

420420
let raw_nodes: Vec<RawNode> = nodes
421421
.iter()
422-
.map(|n| RawNode::new(&n.title).with_content(&n.content).with_level(n.level))
422+
.map(|n| {
423+
RawNode::new(&n.title)
424+
.with_content(&n.content)
425+
.with_level(n.level)
426+
})
423427
.collect();
424428

425429
let compiler_input =
@@ -437,7 +441,11 @@ impl Engine {
437441
let doc_id = uuid::Uuid::new_v4().to_string();
438442

439443
let mut meta = DocumentMeta::new();
440-
meta.update_processing_stats(node_count, result.metrics.total_tokens_generated, result.metrics.total_time_ms());
444+
meta.update_processing_stats(
445+
node_count,
446+
result.metrics.total_tokens_generated,
447+
result.metrics.total_time_ms(),
448+
);
441449

442450
let doc = vectorless_document::Document {
443451
schema_version: CURRENT_SCHEMA_VERSION,

crates/vectorless-engine/src/indexer.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,9 @@ impl IndexerClient {
229229
});
230230

231231
let doc_id = Uuid::new_v4().to_string();
232-
self.events
233-
.emit_compile(CompileEvent::FormatDetected { format: format.clone() });
232+
self.events.emit_compile(CompileEvent::FormatDetected {
233+
format: format.clone(),
234+
});
234235

235236
info!("Compiling {:?} document: {}", format, source_label);
236237

crates/vectorless-py/src/engine.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,16 @@ async fn run_compile_raw(
3232
) -> PyResult<PyDocumentInfo> {
3333
let raw_nodes: Vec<RawNodeInput> = nodes
3434
.into_iter()
35-
.map(|(title, content, level)| RawNodeInput { title, content, level })
35+
.map(|(title, content, level)| RawNodeInput {
36+
title,
37+
content,
38+
level,
39+
})
3640
.collect();
37-
let input = IngestInput::PreParsed { name, nodes: raw_nodes };
41+
let input = IngestInput::PreParsed {
42+
name,
43+
nodes: raw_nodes,
44+
};
3845
let doc = engine.compile(input).await.map_err(to_py_err)?;
3946
Ok(PyDocumentInfo { inner: doc })
4047
}
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
---
2+
sidebar_position: 2
3+
---
4+
5+
# AST-Level Code Parsing
6+
7+
vectorless-code uses tree-sitter to parse source code into semantic nodes — functions, classes, methods — instead of treating files as flat text. This produces a structured tree that the vectorless engine can navigate with precision.
8+
9+
## Why AST Parsing Matters
10+
11+
Naive code indexing treats each file as a single block of text. When you ask "how does authentication work", the engine has to scan entire files hoping to find relevant snippets. There's no understanding of what a function is, what a class contains, or how methods relate to their parent class.
12+
13+
AST parsing changes this. The engine receives a tree like:
14+
15+
```
16+
src/auth.py
17+
├── class_definition: AuthService
18+
│ ├── function_definition: __init__
19+
│ ├── function_definition: login
20+
│ └── function_definition: verify_token
21+
└── function_definition: create_session
22+
```
23+
24+
Now the Orchestrator can `cd` into `AuthService`, `ls` to see its methods, and `cat login` to read the authentication logic. This is the same navigation model that works for documents — applied to code with structural precision.
25+
26+
## How It Works
27+
28+
### Per-Language Node Types
29+
30+
Each language defines which AST node types represent semantic units worth indexing:
31+
32+
```python
33+
SPLITTABLE_NODE_TYPES = {
34+
"python": {
35+
"function_definition",
36+
"class_definition",
37+
"decorated_definition",
38+
"async_function_definition",
39+
},
40+
"rust": {
41+
"function_item",
42+
"impl_item",
43+
"struct_item",
44+
"enum_item",
45+
"trait_item",
46+
"mod_item",
47+
},
48+
# ... 12 languages total
49+
}
50+
```
51+
52+
tree-sitter parses the source into an AST, then vectorless-code walks the tree extracting nodes whose type matches this set. Each extracted node becomes a `CodeNode` with:
53+
54+
- `name` — the symbol name (e.g. `AuthService`, `login`)
55+
- `node_type` — the AST node type (e.g. `class_definition`)
56+
- `content` — the full source code of the node
57+
- `children` — nested definitions (methods inside classes)
58+
59+
### Nested Extraction
60+
61+
When a class is extracted, its methods are extracted as children — not as separate top-level nodes. This preserves the parent-child relationship:
62+
63+
```python
64+
# Input: Python source
65+
class AuthService:
66+
def login(self, username, password):
67+
token = self._create_token(username)
68+
return token
69+
70+
def verify_token(self, token):
71+
return self._decode(token)
72+
73+
# Output: CodeNode tree
74+
CodeNode(
75+
name="AuthService",
76+
node_type="class_definition",
77+
children=[
78+
CodeNode(name="login", node_type="function_definition", ...),
79+
CodeNode(name="verify_token", node_type="function_definition", ...),
80+
],
81+
)
82+
```
83+
84+
This nesting produces the raw_node tree that vectorless builds into a navigable Document. Level 1 = file, Level 2 = top-level definitions, Level 3 = nested definitions.
85+
86+
### Name Extraction
87+
88+
The parser extracts human-readable names from AST nodes by finding identifier children:
89+
90+
- `function_definition` → looks for `identifier` child → `"login"`
91+
- `class_definition` → looks for `identifier` child → `"AuthService"`
92+
- `decorated_definition` → recurses into the decorated node
93+
- `impl_item` → looks for `type_identifier` → `"impl UserService"`
94+
95+
## Fallback Strategy
96+
97+
When tree-sitter is unavailable (unsupported language, grammar not installed, parse error), vectorless-code falls back to line-based splitting: the source is divided on blank-line boundaries into blocks. This produces flat `block` nodes without nesting, but still provides functional indexing.
98+
99+
The fallback is transparent. The same `parse_file()` function handles both paths:
100+
101+
```python
102+
def parse_file(file_path, content, language):
103+
parser = _get_parser(language) # cached per language
104+
if parser is None:
105+
return fallback_split(content, file_path, language)
106+
107+
nodes = ast_extract(parser, content, language)
108+
if not nodes:
109+
return fallback_split(content, file_path, language)
110+
return nodes
111+
```
112+
113+
## Performance Considerations
114+
115+
### Parser Caching
116+
117+
tree-sitter `Parser` instances are cached per language. A 10,000-file Python project creates exactly one Python parser, reused for every `.py` file. This avoids repeated memory allocation and grammar loading.
118+
119+
### Single-Pass File Scan
120+
121+
Files are read exactly once. A single pass computes:
122+
123+
1. File hash (SHA-256 for incremental detection)
124+
2. Stats (line count, byte size, language distribution)
125+
3. Content for parsing
126+
127+
### Incremental Parsing
128+
129+
On subsequent compiles, only files whose hash changed are re-parsed. Unchanged files reuse cached raw_nodes directly. See [Incremental Compilation](./incremental.mdx).
130+
131+
## Adding a New Language
132+
133+
To add support for a new language:
134+
135+
1. Add the language to `SPLITTABLE_NODE_TYPES` with the relevant AST node types
136+
2. Add the tree-sitter grammar package to `pyproject.toml` dependencies
137+
3. Add the package mapping to `_LANG_PACKAGE_MAP`
138+
139+
For example, to add Zig:
140+
141+
```python
142+
# ast_parser.py
143+
SPLITTABLE_NODE_TYPES["zig"] = {
144+
"FunctionDecl",
145+
"TopLevelDecl",
146+
}
147+
148+
_LANG_PACKAGE_MAP["zig"] = "tree_sitter_zig"
149+
```
150+
151+
```toml
152+
# pyproject.toml
153+
"tree-sitter-zig>=0.21",
154+
```
155+
156+
No other code changes needed. The parser, cache, fallback, and incremental systems handle it automatically.

0 commit comments

Comments
 (0)