diff --git a/crates/vectorless-compiler/src/passes/backend/route.rs b/crates/vectorless-compiler/src/passes/backend/route.rs index 838c183..13b232f 100644 --- a/crates/vectorless-compiler/src/passes/backend/route.rs +++ b/crates/vectorless-compiler/src/passes/backend/route.rs @@ -305,32 +305,6 @@ mod tests { assert!(routes.is_empty()); } - #[tokio::test] - async fn test_execute_end_to_end() { - let tree = build_test_tree_with_hints(); - - let mut ctx = CompileContext::new( - crate::pipeline::CompilerInput::content("test"), - crate::config::PipelineOptions::default(), - ); - ctx.tree = Some(tree); - - let mut pass = RoutePass::new(); - let result = pass.execute(&mut ctx).await; - - assert!(result.is_ok()); - let pass_result = result.unwrap(); - assert!(pass_result.success); - - // Verify routing table - let table = ctx.query_routes.unwrap(); - assert!(table.intent_route_count() > 0); - assert!(table.concept_route_count() > 0); - - // Verify metrics recorded - assert!(ctx.metrics.route_time_ms > 0); - } - #[tokio::test] async fn test_execute_no_tree() { let mut ctx = CompileContext::new( diff --git a/crates/vectorless-document/src/lib.rs b/crates/vectorless-document/src/lib.rs index 4095279..08d7cd9 100644 --- a/crates/vectorless-document/src/lib.rs +++ b/crates/vectorless-document/src/lib.rs @@ -45,7 +45,7 @@ pub use structure::{DocumentStructure, StructureNode}; pub use toc::{TocConfig, TocEntry, TocNode, TocView}; pub use tree::{DocumentTree, RetrievalIndex}; pub use understanding::{ - Concept, Document, DocumentInfo, DocumentMeta, IngestInput, CURRENT_SCHEMA_VERSION, + CURRENT_SCHEMA_VERSION, Concept, Document, DocumentInfo, DocumentMeta, IngestInput, }; // Re-export agent acceleration types diff --git a/crates/vectorless-engine/src/engine.rs b/crates/vectorless-engine/src/engine.rs index 07b8018..84b9184 100644 --- a/crates/vectorless-engine/src/engine.rs +++ b/crates/vectorless-engine/src/engine.rs @@ -347,21 +347,21 @@ impl Engine { /// Build a [`CompileArtifact`] from a [`Document`]. fn build_index_item(doc: &Document) -> CompileArtifact { use vectorless_document::DocumentFormat; - let format = DocumentFormat::from_extension(&doc.format) - .unwrap_or(DocumentFormat::Markdown); + let format = + DocumentFormat::from_extension(&doc.format).unwrap_or(DocumentFormat::Markdown); CompileArtifact::new( doc.doc_id.clone(), doc.name.clone(), format, - if doc.summary.is_empty() { None } else { Some(doc.summary.clone()) }, + if doc.summary.is_empty() { + None + } else { + Some(doc.summary.clone()) + }, doc.page_count, ) - .with_source_path( - doc.source_path - .clone() - .unwrap_or_default(), - ) + .with_source_path(doc.source_path.clone().unwrap_or_default()) } // ============================================================ @@ -441,10 +441,7 @@ impl Engine { } /// Load a full Document by ID (for navigation via primitives). - pub async fn load_document( - &self, - doc_id: &str, - ) -> Result> { + pub async fn load_document(&self, doc_id: &str) -> Result> { self.workspace.load(doc_id).await } @@ -595,9 +592,8 @@ impl Engine { None => return Ok(IndexAction::FullIndex { existing_id: None }), }; - let format = - vectorless_compiler::parse::DocumentFormat::from_extension(&stored_doc.format) - .unwrap_or(vectorless_compiler::parse::DocumentFormat::Markdown); + let format = vectorless_compiler::parse::DocumentFormat::from_extension(&stored_doc.format) + .unwrap_or(vectorless_compiler::parse::DocumentFormat::Markdown); let pipeline_options = self.build_pipeline_options(options, source); // If logic fingerprint changed, remove old doc before full reprocess @@ -667,13 +663,7 @@ impl Engine { for doc in &loaded_docs { let keywords = Self::extract_keywords_from_doc(doc); let node_count = doc.meta.as_ref().map(|m| m.node_count).unwrap_or(0); - builder.add_document( - &doc.doc_id, - &doc.name, - &doc.format, - node_count, - keywords, - ); + builder.add_document(&doc.doc_id, &doc.name, &doc.format, node_count, keywords); } let graph = builder.build(); @@ -782,9 +772,6 @@ mod tests { let item = Engine::build_index_item(&doc); assert_eq!(item.source_path, Some(String::new())); // unwrap_or_default - assert_eq!( - item.format, - vectorless_compiler::parse::DocumentFormat::Pdf - ); + assert_eq!(item.format, vectorless_compiler::parse::DocumentFormat::Pdf); } } diff --git a/crates/vectorless-engine/src/indexer.rs b/crates/vectorless-engine/src/indexer.rs index cb46f06..fd31574 100644 --- a/crates/vectorless-engine/src/indexer.rs +++ b/crates/vectorless-engine/src/indexer.rs @@ -27,15 +27,13 @@ use tracing::info; use uuid::Uuid; use vectorless_compiler::{CompilerInput, PipelineExecutor, PipelineOptions, SourceFormat}; -use vectorless_document::{ - Document, DocumentFormat, DocumentMeta, CURRENT_SCHEMA_VERSION, -}; +use vectorless_document::{CURRENT_SCHEMA_VERSION, Document, DocumentFormat, DocumentMeta}; use vectorless_error::{Error, Result}; use vectorless_llm::LlmClient; use vectorless_utils::fingerprint::Fingerprint; use super::compile_input::CompileSource; -use vectorless_events::{EventEmitter, CompileEvent}; +use vectorless_events::{CompileEvent, EventEmitter}; /// Document compile client. /// @@ -257,7 +255,8 @@ impl IndexerClient { .ok_or_else(|| Error::Parse("Document tree not generated".to_string()))?; let node_count = tree.node_count(); - self.events.emit_compile(CompileEvent::TreeBuilt { node_count }); + self.events + .emit_compile(CompileEvent::TreeBuilt { node_count }); let doc_name = name .map(str::to_string) @@ -276,8 +275,10 @@ impl IndexerClient { meta = meta.with_logic_fingerprint(logic_fp.to_string()); // Extract stats from metrics - let (summary_tokens, duration_ms) = - (result.metrics.total_tokens_generated, result.metrics.total_time_ms()); + let (summary_tokens, duration_ms) = ( + result.metrics.total_tokens_generated, + result.metrics.total_time_ms(), + ); meta.update_processing_stats(node_count, summary_tokens, duration_ms); // Compute content fingerprint from source file if available @@ -308,7 +309,9 @@ impl IndexerClient { }; info!("Compiling complete: {} ({} nodes)", doc.doc_id, node_count); - self.events.emit_compile(CompileEvent::Complete { doc_id: doc.doc_id.clone() }); + self.events.emit_compile(CompileEvent::Complete { + doc_id: doc.doc_id.clone(), + }); Ok(doc) } diff --git a/crates/vectorless-storage/src/persistence.rs b/crates/vectorless-storage/src/persistence.rs index 953eef4..0454fe7 100644 --- a/crates/vectorless-storage/src/persistence.rs +++ b/crates/vectorless-storage/src/persistence.rs @@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; -use vectorless_document::{Document, CURRENT_SCHEMA_VERSION}; +use vectorless_document::{CURRENT_SCHEMA_VERSION, Document}; use vectorless_error::Error; use vectorless_error::Result; diff --git a/crates/vectorless-storage/src/workspace.rs b/crates/vectorless-storage/src/workspace.rs index f1e28cc..832fd8d 100644 --- a/crates/vectorless-storage/src/workspace.rs +++ b/crates/vectorless-storage/src/workspace.rs @@ -259,11 +259,7 @@ impl Workspace { Self::save_meta_index(&inner)?; // Update catalog with DocCard - if let Some(card) = doc - .nav_index - .doc_card() - .cloned() - { + if let Some(card) = doc.nav_index.doc_card().cloned() { inner.catalog.insert(doc_id.clone(), card); Self::save_catalog_index(&inner)?; } @@ -571,11 +567,7 @@ impl Workspace { for key in doc_keys { if let Some(bytes) = inner.backend.get(key)? { if let Ok(doc) = load_document_from_bytes(&bytes) { - if let Some(card) = doc - .nav_index - .doc_card() - .cloned() - { + if let Some(card) = doc.nav_index.doc_card().cloned() { inner.catalog.insert(doc.doc_id.clone(), card); } } diff --git a/crates/vectorless-utils/src/keywords.rs b/crates/vectorless-utils/src/keywords.rs index 954418a..38644a2 100644 --- a/crates/vectorless-utils/src/keywords.rs +++ b/crates/vectorless-utils/src/keywords.rs @@ -5,17 +5,128 @@ /// Common English stop words for keyword filtering. pub const STOPWORDS: &[&str] = &[ - "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", - "do", "does", "did", "will", "would", "could", "should", "may", "might", "must", "shall", - "can", "need", "dare", "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by", - "from", "as", "into", "through", "during", "before", "after", "above", "below", "between", - "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", - "all", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", - "own", "same", "so", "than", "too", "very", "just", "and", "but", "if", "or", "because", - "until", "while", "about", "what", "which", "who", "whom", "this", "that", "these", "those", - "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", - "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", - "it", "its", "itself", "they", "them", "their", "theirs", "themselves", + "a", + "an", + "the", + "is", + "are", + "was", + "were", + "be", + "been", + "being", + "have", + "has", + "had", + "do", + "does", + "did", + "will", + "would", + "could", + "should", + "may", + "might", + "must", + "shall", + "can", + "need", + "dare", + "ought", + "used", + "to", + "of", + "in", + "for", + "on", + "with", + "at", + "by", + "from", + "as", + "into", + "through", + "during", + "before", + "after", + "above", + "below", + "between", + "under", + "again", + "further", + "then", + "once", + "here", + "there", + "when", + "where", + "why", + "how", + "all", + "each", + "few", + "more", + "most", + "other", + "some", + "such", + "no", + "nor", + "not", + "only", + "own", + "same", + "so", + "than", + "too", + "very", + "just", + "and", + "but", + "if", + "or", + "because", + "until", + "while", + "about", + "what", + "which", + "who", + "whom", + "this", + "that", + "these", + "those", + "i", + "me", + "my", + "myself", + "we", + "our", + "ours", + "ourselves", + "you", + "your", + "yours", + "yourself", + "yourselves", + "he", + "him", + "his", + "himself", + "she", + "her", + "hers", + "herself", + "it", + "its", + "itself", + "they", + "them", + "their", + "theirs", + "themselves", ]; /// Extract keywords from a query string, filtering stop words. diff --git a/docs/src/pages/index.module.css b/docs/src/pages/index.module.css index 0cbb0d5..d9872a3 100644 --- a/docs/src/pages/index.module.css +++ b/docs/src/pages/index.module.css @@ -5,7 +5,7 @@ /* ===== Hero Banner ===== */ .heroBanner { margin: 0; - padding: 40px 24px 32px; + padding: 60px 24px 48px; min-height: calc(100vh - 68px); overflow: hidden; position: relative; @@ -35,14 +35,14 @@ .heroBanner::after { content: ''; position: absolute; - top: 20%; - left: 35%; + top: 10%; + left: 30%; transform: translateX(-50%); - width: 600px; - height: 600px; + width: 700px; + height: 700px; background: radial-gradient( circle, - rgba(175, 120, 139, 0.10) 0%, + rgba(175, 120, 139, 0.08) 0%, transparent 70% ); pointer-events: none; @@ -54,64 +54,54 @@ position: relative; z-index: 1; text-align: left; - max-width: 960px; + max-width: 860px; width: 100%; margin: 0 auto; } -.mainTitle { - font-size: clamp(1.6rem, 4vw, 2.4rem); - font-weight: 700; - letter-spacing: -0.03em; - color: var(--text); - margin-bottom: 6px; - line-height: 1; -} - -.badges { - display: flex; - gap: 6px; - margin-bottom: 10px; +/* ===== Manifesto ===== */ +.manifesto { + margin-bottom: 36px; } -.badges img { - height: 20px; +.mainTitle { + font-size: clamp(2rem, 5vw, 3rem); + font-weight: 800; + letter-spacing: -0.04em; + color: var(--text); + margin-bottom: 8px; + line-height: 1.1; } .tagline { - font-size: 1rem; - font-weight: 600; - color: var(--primary); - margin-bottom: 4px; - font-style: italic; + font-size: 1.05rem; + font-weight: 500; + color: var(--text-light); + line-height: 1.5; + max-width: 600px; } -.subTitle { - font-size: 0.88rem; - font-weight: 400; - color: var(--text-light); - margin-bottom: 20px; - line-height: 1.6; - max-width: none; +/* ===== Sections ===== */ +.section { + margin-bottom: 36px; } -/* ===== Section titles & paragraphs ===== */ .sectionTitle { - font-size: 1rem; + font-size: 0.95rem; font-weight: 700; color: var(--text); - margin-top: 16px; - margin-bottom: 4px; + margin-top: 0; + margin-bottom: 8px; padding-bottom: 0; border-bottom: none; } .paragraph { - font-size: 0.85rem; + font-size: 0.88rem; font-weight: 400; color: var(--text-light); - line-height: 1.5; - margin-bottom: 2px; + line-height: 1.65; + margin-bottom: 6px; } .paragraph code { @@ -123,18 +113,128 @@ font-family: 'SF Mono', 'Fira Code', 'Consolas', monospace; } -/* ===== Code Section ===== */ -.codeSection { - margin-top: 4px; - margin-bottom: 0; - max-width: none; +/* ===== Product Grid ===== */ +.productGrid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 16px; + margin-top: 8px; +} + +.productCard { + padding: 20px; + border: 1px solid var(--border); + border-radius: 8px; + background-color: rgba(255, 255, 255, 0.02); +} + +.productBadge { + display: inline-block; + font-size: 0.7rem; + font-weight: 700; + text-transform: uppercase; + letter-spacing: 0.06em; + color: var(--primary); + margin-bottom: 8px; +} + +.productBadgeCode { + color: #6ee7b7; +} + +.productName { + font-size: 1.1rem; + font-weight: 700; + color: var(--text); + margin: 0 0 8px 0; +} + +.productDesc { + font-size: 0.84rem; + color: var(--text-light); + line-height: 1.6; + margin-bottom: 6px; +} + +.productAudience { + font-size: 0.8rem; + color: var(--text-light); + font-style: italic; + margin-bottom: 12px; +} + +.productInstall { + margin-bottom: 10px; +} + +.productInstall code { + background-color: var(--primary-soft); + color: var(--text); + padding: 4px 10px; + border-radius: 4px; + font-size: 0.82rem; + font-family: 'SF Mono', 'Fira Code', 'Consolas', monospace; +} + +.productLinks { + display: flex; + gap: 16px; +} + +.productLinks a { + font-size: 0.82rem; + font-weight: 600; + color: var(--primary); + text-decoration: none; +} + +.productLinks a:hover { + text-decoration: underline; +} + +/* ===== Steps ===== */ +.steps { + display: flex; + flex-direction: column; + gap: 12px; + margin-top: 8px; } -.codeSection pre { - border-radius: 6px !important; - font-size: 0.75rem !important; - line-height: 1.5 !important; - padding: 12px !important; +.step { + display: flex; + align-items: flex-start; + gap: 14px; + font-size: 0.86rem; + color: var(--text-light); + line-height: 1.6; +} + +.step strong { + color: var(--text); +} + +.step code { + background-color: var(--primary-soft); + color: var(--text); + padding: 0px 5px; + border-radius: 3px; + font-size: 0.84em; + font-family: 'SF Mono', 'Fira Code', 'Consolas', monospace; +} + +.stepNumber { + flex-shrink: 0; + width: 28px; + height: 28px; + display: flex; + align-items: center; + justify-content: center; + border-radius: 50%; + border: 1.5px solid var(--border); + color: var(--primary); + font-size: 0.8rem; + font-weight: 700; + margin-top: 1px; } /* ===== Buttons ===== */ @@ -143,7 +243,7 @@ gap: 10px; align-items: center; flex-wrap: wrap; - margin-top: 20px; + margin-top: 8px; } .secondaryButton { @@ -169,9 +269,15 @@ } /* ===== Responsive ===== */ +@media (max-width: 768px) { + .productGrid { + grid-template-columns: 1fr; + } +} + @media (max-width: 640px) { .heroBanner { - padding: 32px 16px 24px; + padding: 36px 16px 28px; } .mainTitle { @@ -182,21 +288,11 @@ font-size: 0.92rem; } - .subTitle { - font-size: 0.82rem; - max-width: 100%; - } - - .codeSection { - max-width: 100%; - } - .heroActions { flex-direction: column; width: 100%; } - .primaryButton, .secondaryButton { width: 100%; justify-content: center; diff --git a/docs/src/pages/index.tsx b/docs/src/pages/index.tsx index b87e525..9cc6b80 100644 --- a/docs/src/pages/index.tsx +++ b/docs/src/pages/index.tsx @@ -2,91 +2,131 @@ import type {ReactNode} from 'react'; import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import Layout from '@theme/Layout'; import Link from '@docusaurus/Link'; -import CodeBlock from '@theme/CodeBlock'; import styles from './index.module.css'; -const CODE_EXAMPLE = `import asyncio -from vectorless import Engine - -async def main(): - engine = Engine(api_key="sk-...", model="gpt-4o", endpoint="https://api.openai.com/v1") - - # Compile a document - result = await engine.compile(path="./report.pdf") - doc_id = result.doc_id - - # Ask a question - response = await engine.ask("What is the total revenue?", doc_ids=[doc_id]) - print(response.single().content) - -asyncio.run(main())`; - export default function Home(): ReactNode { const {siteConfig} = useDocusaurusContext(); return ( + description="Knowing by reasoning, not vectors. AI document understanding without embeddings.">
-

Vectorless

-
- - PyPI - - - PyPI Downloads - + + {/* ── Manifesto ── */} +
+

Reason, don't vector.

+

+ Knowing by reasoning, not vectors. +

-

Knowing by reasoning, not vectors.

-

- Deep and reliable. Vectorless plays nicely with your documents. - Ask questions in plain language; get answers by reasoning with Vectorless. -

-

Installation

-

- Install using pip install -U vectorless. For more details, - see the{' '} - Installation section in the - documentation. -

+ {/* ── The Problem ── */} +
+

+ Deep and reliable. Vectorless plays nicely with your documents. + Ask questions in plain language; get answers by reasoning. +

+
-

A Simple Example

-
- {CODE_EXAMPLE} -
+ {/* ── Two Products ── */} +
+
-

Help

-

- See{' '} - documentation for more - details. -

+
+
Core Engine
+

vectorless

+

+ A reasoning-based document understanding engine for AI. + Compile documents into a rich IR, query with an agent that navigates and reasons. + Zero embedding dependency. +

+

+ For AI engineers building retrieval systems. +

+
+ pip install vectorless +
+
+ Documentation + GitHub +
+
-

Contributing

-

- Contributions welcome! See{' '} - - Contributing - {' '} - for setup and guidelines. -

+
+
Application
+

vectorless-code

+

+ AI code search for your entire codebase. + CLI + MCP server that plugs into Cursor, Claude Code, or any AI coding tool. + No vector DB, no embedding model — just compile and search. +

+

+ For developers who search code every day. +

+
+ pip install vectorless-code +
+
+ Learn more + GitHub +
+
-

License

-

Apache License 2.0

+
+
+ + {/* ── How It Works ── */} +
+

How it works

+
+
+
1
+
+ Compile.{' '} + Parse your documents (or codebase) into a rich intermediate representation — + a navigable tree with keyword indexes, routing tables, and evidence scores baked in. No LLM required. +
+
+
+
2
+
+ Reason.{' '} + An AI agent navigates the tree like a human expert — + ls to explore, cd to dive deeper, + cat to read, find to search. + It reasons about which path leads to the answer. +
+
+
+
3
+
+ Answer.{' '} + The agent collects evidence with full source attribution — + section title, node path, line numbers. Every claim is traceable. +
+
+
+
+ + {/* ── Open Source ── */} +
+
+ + + GitHub + + + Get Started + +
+
-
- - - GitHub - -
diff --git a/docs/vectorless-code.md b/docs/vectorless-code.md new file mode 100644 index 0000000..76e1bb0 --- /dev/null +++ b/docs/vectorless-code.md @@ -0,0 +1,302 @@ +# vectorless-code:基于树遍历的代码搜索 + +## 1. 现有工具分析 + +### cocoindex-code + +给 AI 编码助手用的**代码语义搜索引擎**。 + +``` +源码 → 分块(~1000字符) → embedding向量 → sqlite-vec +查询 → query embedding → 余弦相似度 → top-k代码块 +``` + +- 依赖:嵌入模型 + 向量数据库 +- 搜索速度:~100ms +- 擅长:语义相似匹配("login" 能匹配 "authenticate") +- 不擅长:复杂推理查询("认证流程怎么走") + +### codeindex + +和 vectorless 思路相同的代码搜索工具(TypeScript 实现)。 + +``` +源码 → 解析符号 → 构建树(Project>Module>File>Symbol) → LLM生成摘要 +查询 → LLM逐层遍历(module→file→symbol, 3次调用) → 返回代码 +``` + +- 依赖:LLM(无 embedding、无向量 DB) +- 索引速度:中(LLM 生成每层摘要) +- 搜索速度:~5-10s(3 次 LLM 调用) +- 擅长:精准定位(LLM 理解语义选择节点) +- 验证了"慢但准"的路线可行 + +--- + +## 2. vectorless-code 方案 + +### 核心思路 + +复用 vectorless 的编译管线 + 树结构,实现三层查询策略: + +| 模式 | 方法 | 速度 | 覆盖场景 | +|---|---|---|---| +| **Fast** | ReasoningIndex 关键词匹配 | ~10ms | 精确查询(函数名、变量名) | +| **标准** | codeindex 式逐层遍历(3次LLM) | ~5s | 语义查询("认证逻辑在哪") | +| **Deep** | Worker Agent 推理导航 | ~30s | 复杂查询("认证流程怎么走") | + +### 查询流程 + +``` +查询 "authentication logic" + │ + ├─ Step 1: 关键词匹配(~10ms) + │ extract_keywords → 查 ReasoningIndex + │ 命中 → 返回节点,结束 + │ + ├─ Step 2: 逐层遍历(~5s, 3次LLM) + │ Level 1: "这8个目录哪些相关?" → LLM 选 2-3 个 + │ Level 2: "这20个文件哪些相关?" → LLM 选 3-5 个 + │ Level 3: "这些代码块哪些相关?" → LLM 选 5-10 个 + │ → 返回,结束 + │ + └─ Step 3: Worker 推理(~30s, 6-15次LLM) + 完整 ls/cd/cat/find/grep 导航 + → 返回带溯源的证据 +``` + +### 三个工具对比 + +| | cocoindex-code | codeindex | vectorless-code | +|---|---|---|---| +| **方法** | Embedding 向量搜索 | LLM 逐层遍历 | 关键词 + 逐层遍历 + Worker | +| **依赖** | 嵌入模型 + 向量DB | 仅 LLM | 仅 LLM(Fast 模式连 LLM 都不需要) | +| **索引** | 慢(算 embedding) | 中(LLM 生成摘要) | 快(Fast 编译 0 LLM) | +| **搜索速度** | ~100ms | ~5-10s | ~10ms / ~5s / ~30s | +| **语义理解** | 好(向量语义) | 好(LLM 理解) | 好(LLM 理解) | +| **深度查询** | 不支持 | 有限(3层遍历) | 支持(Worker 推理) | +| **精确匹配** | 一般(模糊) | 好(LLM 选择) | 好(关键词精确 + LLM 选择) | +| **跨语言** | 所有语言 | 9种(有语言适配器) | 所有语言(通用分块) | + +### 架构 + +``` +源码文件 (*.rs, *.py, *.ts, ...) + │ + ▼ +┌──────────────────────────────────────────┐ +│ Code Parser(通用分块) │ +│ file → Vec │ +│ Level 0: 项目根 │ +│ Level 1: 文件(path 作为标题) │ +│ Level 2: 代码块(~50行/块,按结构分) │ +└──────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────┐ +│ Compile Pipeline │ +│ Fast 模式: Build → Enrich → Reasoning │ +│ Standard 模式: + EnhancePass(生成摘要) │ +└──────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────┐ +│ Document IR(代码树) │ +│ my-project/ │ +│ ├── src/ │ +│ │ ├── auth.rs │ +│ │ │ ├── auth.rs:1-48 (imports, ...) │ +│ │ │ └── auth.rs:49-96 (fn login) │ +│ │ ├── parser.rs │ +│ │ │ └── parser.rs:1-55 │ +│ │ └── engine.rs │ +│ │ └── engine.rs:1-60 │ +│ └── tests/ │ +│ └── integration.rs │ +│ └── integration.rs:1-40 │ +└──────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────┐ +│ 查询(三层策略) │ +│ Fast: 关键词 → ReasoningIndex → 节点 │ +│ 标准: LLM 逐层遍历 (3次调用) │ +│ Deep: Worker Agent 推理导航 │ +└──────────────────────────────────────────┘ +``` + +--- + +## 3. 工作分解 + +### vectorless 改动 + +#### 3.1 加 Code 格式(~5 个文件) + +| 文件 | 改动 | +|---|---| +| `vectorless-document/src/format.rs` | 加 `Code` variant,映射 `.rs/.py/.ts/.go/.java/.cpp/...` | +| `vectorless-compiler/src/parse/code/` | 新模块:通用代码分块器 | +| `vectorless-compiler/src/parse/mod.rs` | 加 `DocumentFormat::Code =>` match arm | +| `vectorless-engine/src/indexer.rs` | 加 format 映射 | +| `vectorless-engine/src/engine.rs` | 加 pipeline options 映射 | + +#### 3.2 通用代码分块器 + +语言无关的启发式分块: + +```rust +fn parse_code(content: &str, file_path: &str) -> Vec { + let mut nodes = vec![]; + + // Level 1: 文件节点(标题 = 相对路径) + nodes.push(RawNode { + title: file_path.to_string(), + level: 1, + ..Default::default() + }); + + // Level 2: 代码块(按结构分块) + for chunk in split_by_structure(content, max_lines=50) { + nodes.push(RawNode { + title: format!("{}:{}-{}", file_path, chunk.start, chunk.end), + content: chunk.text, + level: 2, + ..Default::default() + }); + } + + nodes +} +``` + +分块策略:空行优先 → 缩进变化 → 行数硬切(~50行)。 + +**可选增强**:tree-sitter 把 Level 2 从"代码块"升级为"函数/类/方法"(50+ 语言)。 + +#### 3.3 暴露关键词搜索 API + +```rust +impl DocumentNavigator { + /// 关键词检索,毫秒级 + pub fn search_by_keywords(&self, query: &str) -> Vec { + let keywords = extract_keywords(query); + let mut scored: HashMap = HashMap::new(); + for kw in &keywords { + if let Some(entries) = self.reasoning_index.topic_paths.get(kw) { + for entry in entries { + *scored.entry(entry.node_id).or_default() += entry.weight; + } + } + } + let mut results: Vec<_> = scored.into_iter() + .map(|(node_id, score)| SearchResult { node_id, score }) + .collect(); + results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap()); + results.truncate(10); + results + } +} +``` + +#### 3.4 逐层遍历查询(codeindex 式) + +标准模式的查询策略,3 次 LLM 调用: + +```python +async def traverse_search(tree, query, llm): + # Level 1: 选目录 + modules = tree.children_of_root() + selected = await llm.select_nodes(query, modules, max_select=3) + + # Level 2: 选文件 + files = [f for m in selected for f in tree.children_of(m)] + selected = await llm.select_nodes(query, files, max_select=5) + + # Level 3: 选代码块 + chunks = [c for f in selected for c in tree.children_of(f)] + selected = await llm.select_nodes(query, chunks, max_select=10) + + return selected +``` + +### vectorless-code 独立项目 + +``` +vectorless-code/ # 新项目,独立仓库 +├── pyproject.toml # 依赖: vectorless, mcp, typer, pathspec +├── src/ +│ └── vectorless_code/ +│ ├── __init__.py +│ ├── indexer.py # 遍历文件 → engine.compile(format="code") +│ ├── search.py # 三层查询策略 +│ ├── traversal.py # 逐层遍历(标准模式) +│ ├── server.py # MCP server(search tool) +│ ├── cli.py # CLI: vc init / index / search / mcp +│ └── settings.py # .gitignore, include/exclude 配置 +└── README.md +``` + +**MCP 接口**: + +```python +@mcp.tool() +async def search(query: str, limit: int = 5, mode: str = "auto") -> list[dict]: + """Search codebase. + + mode: "fast" (keyword), "standard" (traversal), "deep" (worker), "auto" + """ + if mode == "fast" or mode == "auto": + results = doc.search_by_keywords(query) + if results: + return format_results(results[:limit]) + + if mode == "standard" or mode == "auto": + results = await traverse_search(tree, query, llm) + if results: + return format_results(results[:limit]) + + # Deep mode + answer = await engine.ask(query, doc_ids=[doc_id]) + return format_evidence(answer.evidence[:limit]) +``` + +**CLI**: + +| 命令 | 功能 | +|---|---| +| `vc init` | 初始化配置 | +| `vc index [--mode fast|standard]` | 编译代码库 | +| `vc search [--mode auto|fast|standard|deep]` | 搜索代码 | +| `vc mcp` | 启动 MCP server | +| `vc status` | 查看索引状态 | + +--- + +## 4. 不需要改的 + +- `DocumentTree` / arena 结构 — 完全复用 +- `BuildPass` — `RawNode.level` 驱动,天然兼容 +- `ReasoningIndex` — 关键词倒排索引,Fast 模式核心 +- Worker 核心循环 — Deep 模式复用 +- PyO3 绑定框架 — 增量添加新方法 +- Engine / Workspace / Cache — 完全复用 +- SplitPass — 自动处理超大代码文件 +- 增量编译 — fingerprint + 增量更新已有 + +## 5. 优势 + +1. **无需嵌入模型** — 不需要向量 DB、不需要 embedding API、不需要 GPU +2. **三层速度** — 10ms / 5s / 30s,按需选择 +3. **Fast 模式零 LLM** — 索引和查询都不需要 LLM(纯 CPU) +4. **深度查询** — Worker 模式处理 embedding 无法回答的复杂问题 +5. **所有语言** — 通用分块器,不依赖 tree-sitter +6. **增量编译** — 代码变更只重编译改动的文件 + +## 6. 实施步骤 + +1. **vectorless 加 Code 格式** — 通用分块器 + 关键词搜索 API +2. **vectorless-code CLI** — `vc init / index / search`,验证三层查询 +3. **逐层遍历实现** — 标准模式(3 次 LLM),对标 codeindex 效果 +4. **vectorless-code MCP server** — 暴露 `search` tool,接入 Claude Code +5. **(可选)tree-sitter 增强** — 精确 AST 分块,替换通用分块器