From 25cc0fd70c691aae15ce40a529ab64dea8587720 Mon Sep 17 00:00:00 2001 From: Vlad Lasky Date: Wed, 17 Jun 2026 09:57:27 +1000 Subject: [PATCH 1/4] perf(index): remove redundant canonicalize in scan_files The walker uses follow_links(false), so any path it produces without ".." is guaranteed to be within the project root. The per-file canonicalize (two realpath syscalls per file) was added alongside follow_links(false) in 6f94b42 as belt-and-suspenders, but provides no safety benefit when symlinks are not followed. On large projects this eliminates thousands of unnecessary syscalls per search. --- colgrep/src/index/mod.rs | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/colgrep/src/index/mod.rs b/colgrep/src/index/mod.rs index 7559c48..36b26c2 100644 --- a/colgrep/src/index/mod.rs +++ b/colgrep/src/index/mod.rs @@ -2678,31 +2678,20 @@ fn is_file_too_large(path: &Path) -> bool { /// The check is done by canonicalizing both paths and verifying /// the resolved path starts with the project root. fn is_within_project_root(project_root: &Path, relative_path: &Path) -> bool { - // Check for obvious path traversal patterns first (fast path) let path_str = relative_path.to_string_lossy(); - if path_str.contains("..") { - // Could be a traversal attempt - do full canonicalization check - let full_path = project_root.join(relative_path); - match full_path.canonicalize() { - Ok(canonical) => { - // Canonicalize project root as well for accurate comparison - match project_root.canonicalize() { - Ok(canonical_root) => canonical.starts_with(&canonical_root), - Err(_) => false, - } - } - Err(_) => false, // If canonicalization fails, reject the path - } - } else { - // No ".." 
in path, but still verify the path doesn't escape via symlinks - let full_path = project_root.join(relative_path); - if !full_path.exists() { - return true; // Non-existent paths will be skipped later anyway - } - match (full_path.canonicalize(), project_root.canonicalize()) { - (Ok(canonical), Ok(canonical_root)) => canonical.starts_with(&canonical_root), - _ => false, - } + if !path_str.contains("..") { + // The walker uses follow_links(false), so without ".." in the relative + // path there is no way for the entry to escape the project root. + return true; + } + // Path contains ".." - do full canonicalization check + let full_path = project_root.join(relative_path); + match full_path.canonicalize() { + Ok(canonical) => match project_root.canonicalize() { + Ok(canonical_root) => canonical.starts_with(&canonical_root), + Err(_) => false, + }, + Err(_) => false, } } From e92cb957ca5ab634d98957ab948a7e61fa73f331 Mon Sep 17 00:00:00 2001 From: Vlad Lasky Date: Wed, 17 Jun 2026 10:02:31 +1000 Subject: [PATCH 2/4] fix(index): tolerate size == 0 in mtime fast path for legacy states States written before the size field was added deserialize with size: 0 (via #[serde(default)]). The strict equality check `info.size == current_size` always fails for these entries, defeating the fast path and causing every file to be content-hashed on every search. Treat size == 0 as "unknown" and fall back to mtime-only comparison. While real files can have zero size (e.g. semaphore files), such files have no content to index so a false-positive skip is harmless to colgrep. This restores the fast path for any index state written by <= 1.5.4, eliminating O(n) hash_file calls per search when no files have changed. 
--- colgrep/src/index/mod.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/colgrep/src/index/mod.rs b/colgrep/src/index/mod.rs index 36b26c2..ec7c361 100644 --- a/colgrep/src/index/mod.rs +++ b/colgrep/src/index/mod.rs @@ -2886,11 +2886,17 @@ impl IndexBuilder { // every file in the repo before returning results. mtime is compared // at nanosecond precision and paired with size so a same-second edit // can't slip through. + // + // size == 0 means the field was absent when the state was written + // (serde default). Fall back to mtime-only in that case. A real + // zero-byte file (e.g. a semaphore) has no content to index so a + // false-positive skip is harmless. let current_stat = file_stat(&full_path).ok(); if let (Some(info), Some((current_mtime, current_size))) = (state.files.get(path), current_stat) { - if info.mtime == current_mtime && info.size == current_size { + let size_matches = info.size == 0 || info.size == current_size; + if info.mtime == current_mtime && size_matches { plan.unchanged += 1; continue; } From b8db1e722336ed914bceda12e2f3e18a3dea3218 Mon Sep 17 00:00:00 2001 From: Vlad Lasky Date: Wed, 17 Jun 2026 10:03:43 +1000 Subject: [PATCH 3/4] =?UTF-8?q?fix(index):=20migrate=20format=20version=20?= =?UTF-8?q?0=20=E2=86=92=201=20in=20place=20instead=20of=20full=20rebuild?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 0 → 1 format version transition added version tracking and the FileInfo.size field but did not change the on-disk index layout (chunk format, embedding pipeline, metadata schema). A full rebuild discards all embeddings and re-processes every file, which is very expensive for large projects. Instead, bump the version in state.json and save. The incremental update that follows will naturally fill in correct sizes via the self-healing touched-file path. Future format versions that change the actual index layout will still trigger a full rebuild. 
--- colgrep/src/index/mod.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/colgrep/src/index/mod.rs b/colgrep/src/index/mod.rs index ec7c361..3eb8919 100644 --- a/colgrep/src/index/mod.rs +++ b/colgrep/src/index/mod.rs @@ -1592,7 +1592,7 @@ impl IndexBuilder { // instead of rebuilding from scratch. No-op for non-worktree projects. self.maybe_seed_from_worktree(force); - let state = IndexState::load(&self.index_dir)?; + let mut state = IndexState::load(&self.index_dir)?; let index_dir = get_vector_index_path(&self.index_dir); let index_path = index_dir.to_str().unwrap(); let index_exists = index_dir.join("metadata.json").exists(); @@ -1611,9 +1611,20 @@ impl IndexBuilder { && !state.files.is_empty() && state.index_format_version != INDEX_FORMAT_VERSION; - // Forced or index-format change: clean atomic rebuild. Drop any in-progress - // resumable-build marker so we don't try to resume an index we're discarding. - if force || format_mismatch { + // Forced: clean atomic rebuild. Drop any in-progress resumable-build + // marker so we don't try to resume an index we're discarding. + if force { + let _ = std::fs::remove_file(self.index_dir.join(BUILDING_MARKER)); + return self.full_rebuild(languages); + } + + // Format version 0 → 1 did not change the on-disk index layout, only + // added version tracking and the FileInfo.size field (which defaults to + // 0 via serde). Migrate in place rather than discarding the entire index. 
+ if format_mismatch && state.index_format_version == 0 { + state.index_format_version = INDEX_FORMAT_VERSION; + state.save(&self.index_dir)?; + } else if format_mismatch { let _ = std::fs::remove_file(self.index_dir.join(BUILDING_MARKER)); return self.full_rebuild(languages); } From 422ef1e65444c3299921e42c0b0eab42bd8967b6 Mon Sep 17 00:00:00 2001 From: Vlad Lasky Date: Wed, 17 Jun 2026 10:46:20 +1000 Subject: [PATCH 4/4] fix(index): refresh file stats after worktree seeding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When seeding a new worktree's index from a sibling, the copied state carries the source worktree's mtimes. Since git checkout stamps files with the current time, every mtime in the copied state mismatches the local files, causing the fast path to miss and triggering a full content-hash pass on the first search. After copying the source state, stat each tracked file in the current worktree and update its mtime and size while preserving the content hash (which remains valid only for files whose content is identical in both worktrees; sharing the same git objects does not guarantee this when the worktrees have different commits checked out). This makes the fast path effective immediately after seeding. Also enhances the format version 0 → 1 migration to stat all entries, filling in missing sizes and purging entries for files that no longer exist on disk. --- colgrep/src/index/mod.rs | 42 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/colgrep/src/index/mod.rs b/colgrep/src/index/mod.rs index 3eb8919..53cd6b2 100644 --- a/colgrep/src/index/mod.rs +++ b/colgrep/src/index/mod.rs @@ -1623,6 +1623,33 @@ impl IndexBuilder { // 0 via serde). Migrate in place rather than discarding the entire index. if format_mismatch && state.index_format_version == 0 { state.index_format_version = INDEX_FORMAT_VERSION; + // Stat every entry to fill in missing sizes and purge stale entries.
+ let mut gone = Vec::new(); + for (path, info) in state.files.iter_mut() { + let full = self.project_root.join(path); + match file_stat(&full) { + Ok((mtime, size)) => { + if info.size == 0 { + info.size = size; + } + // Legacy states store mtime in seconds; current code + // uses nanoseconds. Upgrade precision when the file + // hasn't been modified (same second), otherwise leave + // stale so the hash pass catches the real change. + if info.mtime < 10_000_000_000_000 { + if info.mtime == mtime / 1_000_000_000 { + info.mtime = mtime; + } + } else if info.mtime == 0 { + info.mtime = mtime; + } + } + Err(_) => gone.push(path.clone()), + } + } + for path in &gone { + state.files.remove(path); + } state.save(&self.index_dir)?; } else if format_mismatch { let _ = std::fs::remove_file(self.index_dir.join(BUILDING_MARKER)); @@ -1959,7 +1986,7 @@ impl IndexBuilder { let src_dir = &candidate.index_dir; // Validate the sibling holds a complete, format-compatible, non-dirty index that // isn't mid-build. Skip otherwise so we never seed from a half-built or stale store. - let Some(src_state) = seed_source_state(src_dir) else { + let Some(mut src_state) = seed_source_state(src_dir) else { continue; }; let src_vector = get_vector_index_path(src_dir); @@ -1978,6 +2005,19 @@ impl IndexBuilder { std::fs::rename(&tmp, &dest_vector) .context("Failed to move seeded index into place")?; + // Refresh stats to match this worktree's files. The content hashes + // from the source are still valid (same git content), but mtimes + // differ because git checkout stamps files with the current time. + // Without this, the mtime fast path would miss on every file and + // trigger a full content-hash pass on the first search. 
+ for (path, info) in src_state.files.iter_mut() { + let full = self.project_root.join(path); + if let Ok((mtime, size)) = file_stat(&full) { + info.mtime = mtime; + info.size = size; + } + } + // Persist state (save() restamps the version/format fields) and a // fresh project.json pointing at THIS worktree, not the source. src_state.save(&self.index_dir)?;