Dev #119 (Merged)
4 changes: 1 addition & 3 deletions Cargo.toml
@@ -4,7 +4,6 @@ members = [
     "crates/vectorless-document",
     "crates/vectorless-config",
     "crates/vectorless-utils",
-    "crates/vectorless-scoring",
     "crates/vectorless-graph",
     "crates/vectorless-events",
     "crates/vectorless-metrics",
@@ -18,7 +17,7 @@ members = [
 resolver = "2"

 [workspace.package]
-version = "0.1.12"
+version = "0.1.13"
 description = "Knowing by reasoning, not vectors."
 edition = "2024"
 authors = ["zTgx <beautifularea@gmail.com>"]
@@ -32,7 +31,6 @@ documentation = "https://docs.rs/vectorless"
 tokio = { version = "1", features = ["full"] }
 async-trait = "0.1"
 futures = "0.3"
-
 # Serialization
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
1 change: 0 additions & 1 deletion crates/vectorless-compiler/Cargo.toml
@@ -14,7 +14,6 @@ vectorless-document = { path = "../vectorless-document" }
 vectorless-error = { path = "../vectorless-error" }
 vectorless-llm = { path = "../vectorless-llm" }
 vectorless-metrics = { path = "../vectorless-metrics" }
-vectorless-scoring = { path = "../vectorless-scoring" }
 vectorless-storage = { path = "../vectorless-storage" }
 vectorless-utils = { path = "../vectorless-utils" }
 tokio = { workspace = true }
62 changes: 20 additions & 42 deletions crates/vectorless-compiler/src/incremental/detector.rs
@@ -7,7 +7,6 @@
 //! enabling precise identification of changed nodes without full reprocessing.

 use std::collections::HashMap;
-use std::hash::{Hash, Hasher};
 use std::path::Path;
 use std::time::SystemTime;

@@ -208,17 +207,10 @@ impl ChangeDetector {
         self
     }

-    /// Compute hash of content (simple u64 hash).
-    fn hash_content(content: &str) -> u64 {
-        let mut hasher = std::collections::hash_map::DefaultHasher::new();
-        content.hash(&mut hasher);
-        hasher.finish()
-    }
-
-    /// Check if a file needs reindexing based on mtime.
-    pub fn needs_reindex_by_mtime(&self, doc_id: &str, path: &Path) -> bool {
+    /// Check if a file needs recompilation based on mtime.
+    pub fn needs_recompile_by_mtime(&self, doc_id: &str, path: &Path) -> bool {
         let Some(recorded_mtime) = self.mtimes.get(doc_id) else {
-            return true; // Never indexed
+            return true; // Never compiled
         };

         let Ok(metadata) = std::fs::metadata(path) else {
@@ -232,8 +224,8 @@ impl ChangeDetector {
         current_mtime > *recorded_mtime
     }

-    /// Check if content needs reindexing based on fingerprint.
-    pub fn needs_reindex_by_hash(&self, doc_id: &str, content: &str) -> bool {
+    /// Check if content needs recompilation based on fingerprint.
+    pub fn needs_recompile_by_hash(&self, doc_id: &str, content: &str) -> bool {
         let current_fp = Fingerprint::from_str(content);

         match self.content_fps.get(doc_id) {
@@ -242,23 +234,23 @@ impl ChangeDetector {
         }
     }

-    /// Check if document needs reindexing based on fingerprint.
-    pub fn needs_reindex_by_fingerprint(&self, doc_id: &str, new_fp: &Fingerprint) -> bool {
+    /// Check if document needs recompilation based on fingerprint.
+    pub fn needs_recompile_by_fingerprint(&self, doc_id: &str, new_fp: &Fingerprint) -> bool {
         match self.content_fps.get(doc_id) {
             Some(recorded_fp) => recorded_fp != new_fp,
             None => true,
         }
     }

     /// Check if processing version has changed.
-    pub fn needs_reindex_by_version(&self, doc_id: &str) -> bool {
+    pub fn needs_recompile_by_version(&self, doc_id: &str) -> bool {
         match self.processing_versions.get(doc_id) {
             Some(recorded_version) => *recorded_version < self.current_processing_version,
             None => true,
         }
     }

-    /// Record document state after indexing.
+    /// Record document state after compiling.
     pub fn record(&mut self, doc_id: &str, content: &str, path: Option<&Path>) {
         self.record_with_tree(doc_id, content, None, path);
     }
@@ -415,7 +407,7 @@ impl ChangeDetector {
         let mut needs_reprocess = Vec::new();

         // If processing version changed, all nodes need reprocessing
-        if self.needs_reindex_by_version(doc_id) {
+        if self.needs_recompile_by_version(doc_id) {
             return Some(new_fps.keys().cloned().collect());
         }

@@ -501,20 +493,6 @@ pub fn compute_tree_fingerprint(tree: &DocumentTree) -> Fingerprint {
     root_fp.subtree
 }

-/// Compute content fingerprint for a single node.
-fn compute_node_content_fp(tree: &DocumentTree, node_id: NodeId) -> Fingerprint {
-    let node = match tree.get(node_id) {
-        Some(n) => n,
-        None => return Fingerprint::zero(),
-    };
-
-    Fingerprinter::new()
-        .with_str(&node.title)
-        .with_str(&node.content)
-        .with_option_str(node.node_id.as_deref())
-        .into_fingerprint()
-}
-
 /// Compute fingerprint for a node and its subtree.
 fn compute_node_fingerprint(tree: &DocumentTree, node_id: NodeId) -> NodeFingerprint {
     let node = match tree.get(node_id) {
@@ -578,20 +556,20 @@ mod tests {
     }

     #[test]
-    fn test_needs_reindex_by_hash() {
+    fn test_needs_recompile_by_hash() {
         let mut detector = ChangeDetector::new();

-        // First time: always needs reindex
-        assert!(detector.needs_reindex_by_hash("doc1", "content"));
+        // First time: always needs recompilation
+        assert!(detector.needs_recompile_by_hash("doc1", "content"));

         // Record the content
         detector.record("doc1", "content", None);

-        // Same content: no reindex needed
-        assert!(!detector.needs_reindex_by_hash("doc1", "content"));
+        // Same content: no recompilation needed
+        assert!(!detector.needs_recompile_by_hash("doc1", "content"));

-        // Different content: needs reindex
-        assert!(detector.needs_reindex_by_hash("doc1", "new content"));
+        // Different content: needs recompilation
+        assert!(detector.needs_recompile_by_hash("doc1", "new content"));
     }

     #[test]
@@ -614,12 +592,12 @@
         let mut detector = ChangeDetector::new().with_processing_version(2);
         detector.record("doc1", "content", None);

-        // Version matches, no reindex needed
-        assert!(!detector.needs_reindex_by_version("doc1"));
+        // Version matches, no recompilation needed
+        assert!(!detector.needs_recompile_by_version("doc1"));

         // Create new detector with higher version
         let detector2 = ChangeDetector::new().with_processing_version(3);
-        assert!(detector2.needs_reindex_by_version("doc1"));
+        assert!(detector2.needs_recompile_by_version("doc1"));
     }

     #[test]
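The rename from `needs_reindex_*` to `needs_recompile_*` is mechanical, but a call site after this change would read roughly as follows. This is a minimal sketch using only signatures visible in the diff (`ChangeDetector::new`, `needs_recompile_by_hash`, `record`); the import path is an assumption based on the `pub use` in `incremental/mod.rs` below, and `compile_document` is a hypothetical stand-in for the real pipeline entry point.

```rust
// Sketch only. Import path assumed from the `pub use detector::ChangeDetector;`
// re-export in incremental/mod.rs; not verified against the crate root.
use vectorless_compiler::incremental::ChangeDetector;

fn compile_if_changed(detector: &mut ChangeDetector, doc_id: &str, content: &str) {
    // Fingerprint comparison replaces the DefaultHasher-based hash_content
    // helper deleted in this PR.
    if detector.needs_recompile_by_hash(doc_id, content) {
        compile_document(doc_id, content); // hypothetical pipeline entry point
        // Record the new fingerprint; passing None skips mtime tracking.
        detector.record(doc_id, content, None);
    }
}

fn compile_document(_doc_id: &str, _content: &str) {
    // Placeholder for the actual compile pipeline.
}
```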
1 change: 0 additions & 1 deletion crates/vectorless-compiler/src/incremental/mod.rs
@@ -16,7 +16,6 @@

 mod detector;
 mod resolver;
-mod updater;

 pub use detector::ChangeDetector;
 pub use resolver::{IndexAction, SkipInfo, resolve_action};
43 changes: 27 additions & 16 deletions crates/vectorless-compiler/src/incremental/resolver.rs
@@ -11,9 +11,7 @@
 use tracing::info;

 use crate::config::PipelineOptions;
-use vectorless_document::DocumentFormat;
-use vectorless_document::DocumentTree;
-use vectorless_storage::PersistedDocument;
+use vectorless_document::{Document, DocumentFormat, DocumentTree};
 use vectorless_utils::fingerprint::Fingerprint;

 /// Action to take for a source during indexing.
@@ -64,42 +62,55 @@ pub struct SkipInfo {
 /// with the old tree for partial reprocessing.
 pub fn resolve_action(
     file_bytes: &[u8],
-    stored_doc: &PersistedDocument,
+    stored_doc: &Document,
     pipeline_options: &PipelineOptions,
     format: DocumentFormat,
 ) -> IndexAction {
     let current_fp = Fingerprint::from_bytes(file_bytes);
+    let current_fp_hex = current_fp.to_string();
+
+    // Get the stored DocumentMeta (if present)
+    let stored_meta = match stored_doc.meta.as_ref() {
+        Some(m) => m,
+        None => {
+            // No meta → must be a very old format, full reprocess
+            return IndexAction::FullIndex {
+                existing_id: Some(stored_doc.doc_id.clone()),
+            };
+        }
+    };

     // Layer 1: File-level content fingerprint
-    if !stored_doc
-        .meta
-        .needs_reprocessing(&current_fp, pipeline_options.processing_version)
-    {
+    if !stored_meta.needs_reprocessing(&current_fp_hex, pipeline_options.processing_version) {
         info!("File fingerprint unchanged, skipping");
         return IndexAction::Skip(SkipInfo {
-            doc_id: stored_doc.meta.id.clone(),
-            name: stored_doc.meta.name.clone(),
+            doc_id: stored_doc.doc_id.clone(),
+            name: stored_doc.name.clone(),
             format,
-            description: stored_doc.meta.description.clone(),
-            page_count: stored_doc.meta.page_count,
+            description: if stored_doc.summary.is_empty() {
+                None
+            } else {
+                Some(stored_doc.summary.clone())
+            },
+            page_count: stored_doc.page_count,
         });
     }

     // Layer 2: Logic fingerprint (pipeline config changed?)
     let current_logic_fp = pipeline_options.logic_fingerprint();
-    if stored_doc.meta.logic_fingerprint != current_logic_fp
-        && !stored_doc.meta.logic_fingerprint.is_zero()
+    if stored_meta.logic_fingerprint != current_logic_fp.to_string()
+        && !stored_meta.logic_fingerprint.is_empty()
     {
         info!("Logic fingerprint changed, full reprocess required");
         return IndexAction::FullIndex {
-            existing_id: Some(stored_doc.meta.id.clone()),
+            existing_id: Some(stored_doc.doc_id.clone()),
         };
     }

     // Layer 3: Content changed, pipeline unchanged → incremental update
     info!("Content changed, pipeline unchanged → incremental update");
     IndexAction::IncrementalUpdate {
         old_tree: stored_doc.tree.clone(),
-        existing_id: stored_doc.meta.id.clone(),
+        existing_id: stored_doc.doc_id.clone(),
     }
 }
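Taken together, `resolve_action` now reads everything it needs from the flattened `Document` (with `DocumentMeta` optional for legacy records) and reports one of three outcomes. Below is a hedged sketch of a caller branching on the result; the variant shapes are taken from the diff above, while the import paths and the surrounding function are assumptions.

```rust
// Hypothetical caller. Paths assumed from this PR: incremental/mod.rs re-exports
// IndexAction and resolve_action; PipelineOptions is assumed to be public at the
// compiler crate's config module.
use vectorless_compiler::config::PipelineOptions;
use vectorless_compiler::incremental::{IndexAction, resolve_action};
use vectorless_document::{Document, DocumentFormat};

fn plan_update(bytes: &[u8], stored: &Document, opts: &PipelineOptions, format: DocumentFormat) {
    match resolve_action(bytes, stored, opts, format) {
        // Layer 1 hit: file fingerprint and processing version unchanged.
        IndexAction::Skip(info) => {
            println!("up to date: {} ({})", info.doc_id, info.name);
        }
        // Layer 2 (or a legacy document with no meta): rebuild, reusing the old id.
        IndexAction::FullIndex { existing_id } => {
            let _ = existing_id; // full recompile goes here
        }
        // Layer 3: content changed, pipeline unchanged; reprocess against old_tree.
        IndexAction::IncrementalUpdate { old_tree, existing_id } => {
            let _ = (old_tree, existing_id); // partial reprocess goes here
        }
    }
}
```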