From 48a49358fdeeff4a6f615c5eab35ded688f89a5f Mon Sep 17 00:00:00 2001
From: Raphael Sourty <raphael.sourty@lighton.ai>
Date: Wed, 17 Jun 2026 11:26:09 +0200
Subject: [PATCH] fix(colgrep): eliminate unnecessary re-indexing on every
 search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Integrates #134 (by @vlasky) — which fixes several issues that made every
search re-hash every tracked file on large projects — with corrections.

From #134 (kept):
- Skip redundant canonicalize() in scan_files for paths without '..' (the
  walker's follow_links(false) + is_file() filter already block symlink escape).
- Migrate index format 0 -> 1 in place (stat to backfill sizes, upgrade legacy
  second-precision mtimes to nanoseconds) instead of discarding and re-embedding
  the whole index — the on-disk layout did not change across 1.5.4 -> 1.5.5.
- Refresh file stats after worktree seeding so git-checkout's 'now' mtimes don't
  defeat the fast path on first search.

Corrections:
- Drop the size==0 'mtime-only' fast-path tolerance: it reopened the same-second
  edit blind spot. Require a strict mtime+size match; legacy size==0 entries are
  hashed once (correct) and then backfilled by the migration / touched-persist.
- Stop purging stat-missing entries from state during migration; leave them so
  the normal incremental-delete path removes them from every store (vector +
  metadata + FTS5) deterministically, instead of relying on periodic orphan cleanup.

Tests:
- test_index_stays_synced_with_disk: deleting files on disk is detected, purges
  them from every store (not keyword-retrievable), keeps survivors, and a content
  change is detected as changed.
- test_format_migration_in_place_cleans_deleted_without_rebuild: migration reuses
  the legacy index (no model => a rebuild would fail) and ends in sync with disk.

Validated: clippy clean; cargo test -p colgrep (557 passed).

Co-authored-by: Raphael Sourty <raphael.sourty@lighton.ai>
Co-authored-by: vlasky <vlad.lasky@energyone.com>
---
 colgrep/src/index/mod.rs | 309 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 275 insertions(+), 34 deletions(-)

diff --git a/colgrep/src/index/mod.rs b/colgrep/src/index/mod.rs
index 7559c48..d4ea4e9 100644
--- a/colgrep/src/index/mod.rs
+++ b/colgrep/src/index/mod.rs
@@ -1592,7 +1592,7 @@ impl IndexBuilder {
         // instead of rebuilding from scratch. No-op for non-worktree projects.
         self.maybe_seed_from_worktree(force);
 
-        let state = IndexState::load(&self.index_dir)?;
+        let mut state = IndexState::load(&self.index_dir)?;
         let index_dir = get_vector_index_path(&self.index_dir);
         let index_path = index_dir.to_str().unwrap();
         let index_exists = index_dir.join("metadata.json").exists();
@@ -1611,9 +1611,48 @@ impl IndexBuilder {
             && !state.files.is_empty()
             && state.index_format_version != INDEX_FORMAT_VERSION;
 
-        // Forced or index-format change: clean atomic rebuild. Drop any in-progress
-        // resumable-build marker so we don't try to resume an index we're discarding.
-        if force || format_mismatch {
+        // Forced: clean atomic rebuild. Drop any in-progress resumable-build
+        // marker so we don't try to resume an index we're discarding.
+        if force {
+            let _ = std::fs::remove_file(self.index_dir.join(BUILDING_MARKER));
+            return self.full_rebuild(languages);
+        }
+
+        // Format version 0 → 1 did not change the on-disk index layout, only
+        // added version tracking and the FileInfo.size field (which defaults to
+        // 0 via serde). Migrate in place rather than discarding the entire index.
+        if format_mismatch && state.index_format_version == 0 {
+            state.index_format_version = INDEX_FORMAT_VERSION;
+            // Stat every still-present entry to fill in missing sizes and upgrade
+            // legacy second-precision mtimes, restoring the stat-only fast path
+            // without re-embedding. Entries whose file is gone are deliberately
+            // LEFT in the state: the incremental update below will see them
+            // missing from disk, mark them deleted, and remove them from every
+            // store (vector index + metadata + FTS5) via delete_files_from_index.
+            // Dropping them here would instead orphan their index rows — the
+            // deleted content would stay searchable until a later periodic cleanup.
+            for (path, info) in state.files.iter_mut() {
+                let full = self.project_root.join(path);
+                let Ok((mtime, size)) = file_stat(&full) else {
+                    continue; // gone on disk → handled as a deletion by the update
+                };
+                if info.size == 0 {
+                    info.size = size;
+                }
+                // Legacy states store mtime in seconds; current code uses
+                // nanoseconds. Upgrade precision when the file hasn't been
+                // modified (same second), otherwise leave the value stale so the
+                // hash pass below catches the real change.
+                if info.mtime < 10_000_000_000_000 {
+                    if info.mtime == mtime / 1_000_000_000 {
+                        info.mtime = mtime;
+                    }
+                } else if info.mtime == 0 {
+                    info.mtime = mtime;
+                }
+            }
+            state.save(&self.index_dir)?;
+        } else if format_mismatch {
             let _ = std::fs::remove_file(self.index_dir.join(BUILDING_MARKER));
             return self.full_rebuild(languages);
         }
@@ -1948,7 +1987,7 @@ impl IndexBuilder {
             let src_dir = &candidate.index_dir;
             // Validate the sibling holds a complete, format-compatible, non-dirty index that
             // isn't mid-build. Skip otherwise so we never seed from a half-built or stale store.
-            let Some(src_state) = seed_source_state(src_dir) else {
+            let Some(mut src_state) = seed_source_state(src_dir) else {
                 continue;
             };
             let src_vector = get_vector_index_path(src_dir);
@@ -1967,6 +2006,19 @@ impl IndexBuilder {
             std::fs::rename(&tmp, &dest_vector)
                 .context("Failed to move seeded index into place")?;
 
+            // Refresh stats to match this worktree's files. The content hashes
+            // from the source are still valid (same git content), but mtimes
+            // differ because git checkout stamps files with the current time.
+            // Without this, the mtime fast path would miss on every file and
+            // trigger a full content-hash pass on the first search.
+            for (path, info) in src_state.files.iter_mut() {
+                let full = self.project_root.join(path);
+                if let Ok((mtime, size)) = file_stat(&full) {
+                    info.mtime = mtime;
+                    info.size = size;
+                }
+            }
+
             // Persist state (save() restamps the version/format fields) and a
             // fresh project.json pointing at THIS worktree, not the source.
             src_state.save(&self.index_dir)?;
@@ -2678,31 +2730,20 @@ fn is_file_too_large(path: &Path) -> bool {
 /// The check is done by canonicalizing both paths and verifying
 /// the resolved path starts with the project root.
 fn is_within_project_root(project_root: &Path, relative_path: &Path) -> bool {
-    // Check for obvious path traversal patterns first (fast path)
     let path_str = relative_path.to_string_lossy();
-    if path_str.contains("..") {
-        // Could be a traversal attempt - do full canonicalization check
-        let full_path = project_root.join(relative_path);
-        match full_path.canonicalize() {
-            Ok(canonical) => {
-                // Canonicalize project root as well for accurate comparison
-                match project_root.canonicalize() {
-                    Ok(canonical_root) => canonical.starts_with(&canonical_root),
-                    Err(_) => false,
-                }
-            }
-            Err(_) => false, // If canonicalization fails, reject the path
-        }
-    } else {
-        // No ".." in path, but still verify the path doesn't escape via symlinks
-        let full_path = project_root.join(relative_path);
-        if !full_path.exists() {
-            return true; // Non-existent paths will be skipped later anyway
-        }
-        match (full_path.canonicalize(), project_root.canonicalize()) {
-            (Ok(canonical), Ok(canonical_root)) => canonical.starts_with(&canonical_root),
-            _ => false,
-        }
+    if !path_str.contains("..") {
+        // The walker uses follow_links(false), so without ".." in the relative
+        // path there is no way for the entry to escape the project root.
+        return true;
+    }
+    // Path contains ".." - do full canonicalization check
+    let full_path = project_root.join(relative_path);
+    match full_path.canonicalize() {
+        Ok(canonical) => match project_root.canonicalize() {
+            Ok(canonical_root) => canonical.starts_with(&canonical_root),
+            Err(_) => false,
+        },
+        Err(_) => false,
     }
 }
 
@@ -2892,11 +2933,20 @@ impl IndexBuilder {
             }
             let full_path = self.project_root.join(path);
 
-            // Fast path: if mtime AND size are both unchanged, skip expensive
-            // content hashing. Without this gate every search reads and hashes
-            // every file in the repo before returning results. mtime is compared
-            // at nanosecond precision and paired with size so a same-second edit
-            // can't slip through.
+            // Fast path: skip expensive content hashing only when BOTH the
+            // nanosecond mtime and the size are unchanged. Without this gate
+            // every search reads and hashes every file in the repo before
+            // returning results. The size is a required second signal: pairing
+            // it with the mtime closes the same-second-edit blind spot (an edit
+            // that lands in the same wall-clock instant but changes the file).
+            //
+            // We do NOT skip on a matching mtime alone when the recorded size is
+            // unknown (0, from a legacy state written before the field existed):
+            // that would reopen the blind spot. Such entries fall through and are
+            // hashed once — correct, and cheap — after which the update persists a
+            // real size (see UpdatePlan.touched) and the format migration backfills
+            // sizes for the whole index in a single stat pass. A genuine zero-byte
+            // file matches strictly (0 == 0) and is skipped as expected.
             let current_stat = file_stat(&full_path).ok();
             if let (Some(info), Some((current_mtime, current_size))) =
                 (state.files.get(path), current_stat)
@@ -5441,6 +5491,197 @@ mod tests {
         }
     }
 
+    /// Core invariant: the index stays in sync with what is on disk. Files removed
+    /// from disk must be (a) detected by the update plan and (b) purged from *every*
+    /// store so keyword search can no longer retrieve them, while survivors remain
+    /// retrievable; and a file whose content changes on disk must be detected as
+    /// changed (the index never silently serves stale results).
+    #[test]
+    fn test_index_stays_synced_with_disk() {
+        let _serial = DELETE_TEST_SERIAL.lock().unwrap_or_else(|e| e.into_inner());
+
+        let proj = tempfile::tempdir().unwrap();
+        let index = tempfile::tempdir().unwrap();
+        let index_path = index.path().to_str().unwrap();
+        let files = ["a.rs", "b.rs", "c.rs", "d.rs", "e.rs"];
+
+        // Real files on disk, a matching index (vector + metadata + FTS5), and a
+        // state that tracks them with correct hash/mtime/size.
+        let mut state = IndexState::default();
+        for f in files {
+            let p = proj.path().join(f);
+            std::fs::write(&p, format!("fn {}() {{}}\n", f.replace('.', "_"))).unwrap();
+            state.files.insert(
+                PathBuf::from(f),
+                FileInfo::probe(&p, hash_file(&p).unwrap()).unwrap(),
+            );
+        }
+        build_fixture_index(index_path, &files, 1);
+        let builder = test_builder(proj.path(), &index.path().join("unused"));
+
+        // In sync to start: nothing added/changed/deleted, everything searchable.
+        let plan = builder.compute_update_plan(&state, None).unwrap();
+        assert_eq!(plan.unchanged, files.len());
+        assert!(plan.added.is_empty() && plan.changed.is_empty() && plan.deleted.is_empty());
+        for f in files {
+            assert_eq!(fts_hits(index_path, &fixture_token(f)), 1, "pre {f}");
+        }
+
+        // Remove two files from disk.
+        std::fs::remove_file(proj.path().join("a.rs")).unwrap();
+        std::fs::remove_file(proj.path().join("c.rs")).unwrap();
+
+        // (a) the plan detects exactly those deletions; survivors stay unchanged.
+        let plan = builder.compute_update_plan(&state, None).unwrap();
+        let mut deleted = plan.deleted.clone();
+        deleted.sort();
+        assert_eq!(
+            deleted,
+            vec![PathBuf::from("a.rs"), PathBuf::from("c.rs")],
+            "disk deletions must be detected"
+        );
+        assert!(plan.changed.is_empty());
+        assert_eq!(plan.unchanged, 3);
+
+        // (b) applying the deletions purges every store -> not retrievable; survivors kept.
+        assert_eq!(
+            delete_files_from_index(index_path, &plan.deleted).unwrap(),
+            2
+        );
+        assert_eq!(
+            fts_hits(index_path, &fixture_token("a.rs")),
+            0,
+            "deleted a still searchable"
+        );
+        assert_eq!(
+            fts_hits(index_path, &fixture_token("c.rs")),
+            0,
+            "deleted c still searchable"
+        );
+        for f in ["b.rs", "d.rs", "e.rs"] {
+            assert_eq!(
+                fts_hits(index_path, &fixture_token(f)),
+                1,
+                "survivor {f} lost"
+            );
+        }
+        let mut remaining = filtering::get_distinct_strings(index_path, "file").unwrap();
+        remaining.sort();
+        assert_eq!(
+            remaining,
+            vec!["b.rs".to_string(), "d.rs".to_string(), "e.rs".to_string()]
+        );
+        assert_eq!(
+            MmapIndex::load(index_path).unwrap().metadata.num_documents,
+            3,
+            "vector index doc count must match surviving files"
+        );
+
+        // (c) a content change on disk is detected as changed (never served stale).
+        std::fs::write(
+            proj.path().join("b.rs"),
+            "fn b_rs_changed() { let _ = 42; }\n",
+        )
+        .unwrap();
+        let plan = builder.compute_update_plan(&state, None).unwrap();
+        assert!(
+            plan.changed.contains(&PathBuf::from("b.rs")),
+            "modified file must be detected as changed"
+        );
+    }
+
+    /// Format-version 0 → 1 migration must (a) migrate **in place without re-embedding**
+    /// and (b) leave the index in sync with disk. Point (a) is enforced structurally:
+    /// `test_builder` has no model, so a full rebuild would fail trying to embed — the
+    /// run succeeding proves the legacy index was reused, not rebuilt. Point (b): a file
+    /// that disappeared while on the old version ends up untracked and unsearchable. The
+    /// migration deliberately leaves stat-missing entries in the state so the normal
+    /// incremental-delete path removes them from every store (deterministic), rather than
+    /// purging them from state and leaning on periodic orphan cleanup.
+    #[test]
+    fn test_format_migration_in_place_cleans_deleted_without_rebuild() {
+        let _serial = DELETE_TEST_SERIAL.lock().unwrap_or_else(|e| e.into_inner());
+
+        let proj = tempfile::tempdir().unwrap();
+        let idx = tempfile::tempdir().unwrap();
+        let vector = get_vector_index_path(idx.path());
+        let vector_str = vector.to_str().unwrap();
+
+        // a/b/c exist on disk; "gone.rs" exists only in the index + legacy state.
+        let present = ["a.rs", "b.rs", "c.rs"];
+        for f in present {
+            std::fs::write(
+                proj.path().join(f),
+                format!("fn {}() {{}}\n", f.replace('.', "_")),
+            )
+            .unwrap();
+        }
+        build_fixture_index(vector_str, &["a.rs", "b.rs", "c.rs", "gone.rs"], 1);
+
+        // Legacy state: format version 0, size 0 (field absent), second-precision mtime.
+        let mut state = IndexState::default();
+        assert_eq!(state.index_format_version, 0);
+        for f in present {
+            let (mt_ns, _) = file_stat(&proj.path().join(f)).unwrap();
+            state.files.insert(
+                PathBuf::from(f),
+                FileInfo {
+                    content_hash: hash_file(&proj.path().join(f)).unwrap(),
+                    mtime: mt_ns / 1_000_000_000, // legacy seconds
+                    size: 0,                      // legacy: field absent
+                },
+            );
+        }
+        state.files.insert(
+            PathBuf::from("gone.rs"),
+            FileInfo {
+                content_hash: 1,
+                mtime: 1,
+                size: 0,
+            },
+        );
+        state.save(idx.path()).unwrap();
+        assert_eq!(
+            fts_hits(vector_str, &fixture_token("gone.rs")),
+            1,
+            "precondition"
+        );
+
+        // Run the real entry point: migrates (0→1) in place, then syncs deletions.
+        let mut builder = test_builder(proj.path(), idx.path());
+        builder.index(None, false).unwrap();
+
+        let migrated = IndexState::load(idx.path()).unwrap();
+        assert_eq!(
+            migrated.index_format_version, INDEX_FORMAT_VERSION,
+            "format must be migrated"
+        );
+        // Survivors kept (not re-embedded — still present in every store).
+        for f in present {
+            assert!(
+                migrated.files.contains_key(&PathBuf::from(f)),
+                "{f} dropped"
+            );
+            assert_eq!(
+                fts_hits(vector_str, &fixture_token(f)),
+                1,
+                "survivor {f} lost"
+            );
+        }
+        // The disappeared file is fully gone — untracked AND unsearchable, not orphaned.
+        assert!(!migrated.files.contains_key(&PathBuf::from("gone.rs")));
+        assert_eq!(
+            fts_hits(vector_str, &fixture_token("gone.rs")),
+            0,
+            "file deleted before upgrade is still searchable after migration (orphaned)"
+        );
+        assert_eq!(
+            MmapIndex::load(vector_str).unwrap().metadata.num_documents,
+            3,
+            "index must hold exactly the 3 surviving docs"
+        );
+    }
+
     /// Issue #115 (bug 1): an index left dirty by an interrupted run must be cleaned once the
     /// repair has reconciled it, even when there are no file changes. Otherwise the dirty flag
     /// is never cleared and every future run pays for a needless repair — "permanently dirty".