diff --git a/colgrep/src/index/mod.rs b/colgrep/src/index/mod.rs
index 7559c48..53cd6b2 100644
--- a/colgrep/src/index/mod.rs
+++ b/colgrep/src/index/mod.rs
@@ -1592,7 +1592,7 @@ impl IndexBuilder {
         // instead of rebuilding from scratch. No-op for non-worktree projects.
         self.maybe_seed_from_worktree(force);
 
-        let state = IndexState::load(&self.index_dir)?;
+        let mut state = IndexState::load(&self.index_dir)?;
         let index_dir = get_vector_index_path(&self.index_dir);
         let index_path = index_dir.to_str().unwrap();
         let index_exists = index_dir.join("metadata.json").exists();
@@ -1611,9 +1611,45 @@ impl IndexBuilder {
             && !state.files.is_empty()
             && state.index_format_version != INDEX_FORMAT_VERSION;
 
-        // Forced or index-format change: clean atomic rebuild. Drop any in-progress
-        // resumable-build marker so we don't try to resume an index we're discarding.
-        if force || format_mismatch {
+        // Forced: clean atomic rebuild. Drop any in-progress resumable-build
+        // marker so we don't try to resume an index we're discarding.
+        if force {
+            let _ = std::fs::remove_file(self.index_dir.join(BUILDING_MARKER));
+            return self.full_rebuild(languages);
+        }
+
+        // Format version 0 → 1 did not change the on-disk index layout, only
+        // added version tracking and the FileInfo.size field (which defaults to
+        // 0 via serde). Migrate in place rather than discarding the entire index.
+        if format_mismatch && state.index_format_version == 0 {
+            state.index_format_version = INDEX_FORMAT_VERSION;
+            // Stat every entry to fill in missing sizes and purge stale entries.
+            let mut gone = Vec::new();
+            for (path, info) in state.files.iter_mut() {
+                let full = self.project_root.join(path);
+                match file_stat(&full) {
+                    Ok((mtime, size)) => {
+                        if info.size == 0 {
+                            info.size = size;
+                        }
+                        // Legacy states store mtime in seconds; current code
+                        // uses nanoseconds. Upgrade precision when the file
+                        // hasn't been modified (same second), otherwise leave
+                        // stale so the hash pass catches the real change. A
+                        // missing (zero) mtime never matches a real file and so
+                        // also stays stale, forcing the hash pass to verify it.
+                        if info.mtime < 10_000_000_000_000 && info.mtime == mtime / 1_000_000_000 {
+                            info.mtime = mtime;
+                        }
+                    }
+                    Err(_) => gone.push(path.clone()),
+                }
+            }
+            for path in &gone {
+                state.files.remove(path);
+            }
+            state.save(&self.index_dir)?;
+        } else if format_mismatch {
             let _ = std::fs::remove_file(self.index_dir.join(BUILDING_MARKER));
             return self.full_rebuild(languages);
         }
@@ -1948,7 +1984,7 @@ impl IndexBuilder {
             let src_dir = &candidate.index_dir;
             // Validate the sibling holds a complete, format-compatible, non-dirty index that
             // isn't mid-build. Skip otherwise so we never seed from a half-built or stale store.
-            let Some(src_state) = seed_source_state(src_dir) else {
+            let Some(mut src_state) = seed_source_state(src_dir) else {
                 continue;
             };
             let src_vector = get_vector_index_path(src_dir);
@@ -1967,6 +2003,24 @@ impl IndexBuilder {
             std::fs::rename(&tmp, &dest_vector)
                 .context("Failed to move seeded index into place")?;
 
+            // Refresh stats to match this worktree's files. git checkout stamps
+            // files with the current time, so without this the mtime fast path
+            // would miss on every file and trigger a full content-hash pass on
+            // the first search. The source's content hash is only reusable when
+            // the file is identical in both worktrees, so restamp only entries
+            // whose size still agrees with the source; anything else keeps its
+            // stale stat and is re-verified by the hash pass.
+            // NOTE(review): a same-size edit on a different branch still slips
+            // through the fast path; confirm whether that is acceptable.
+            for (path, info) in src_state.files.iter_mut() {
+                let full = self.project_root.join(path);
+                if let Ok((mtime, size)) = file_stat(&full) {
+                    if info.size == size {
+                        info.mtime = mtime;
+                    }
+                }
+            }
+
             // Persist state (save() restamps the version/format fields) and a
             // fresh project.json pointing at THIS worktree, not the source.
             src_state.save(&self.index_dir)?;
@@ -2678,31 +2732,18 @@ fn is_file_too_large(path: &Path) -> bool {
 /// The check is done by canonicalizing both paths and verifying
 /// the resolved path starts with the project root.
 fn is_within_project_root(project_root: &Path, relative_path: &Path) -> bool {
-    // Check for obvious path traversal patterns first (fast path)
     let path_str = relative_path.to_string_lossy();
-    if path_str.contains("..") {
-        // Could be a traversal attempt - do full canonicalization check
-        let full_path = project_root.join(relative_path);
-        match full_path.canonicalize() {
-            Ok(canonical) => {
-                // Canonicalize project root as well for accurate comparison
-                match project_root.canonicalize() {
-                    Ok(canonical_root) => canonical.starts_with(&canonical_root),
-                    Err(_) => false,
-                }
-            }
-            Err(_) => false, // If canonicalization fails, reject the path
-        }
-    } else {
-        // No ".." in path, but still verify the path doesn't escape via symlinks
-        let full_path = project_root.join(relative_path);
-        if !full_path.exists() {
-            return true; // Non-existent paths will be skipped later anyway
-        }
-        match (full_path.canonicalize(), project_root.canonicalize()) {
-            (Ok(canonical), Ok(canonical_root)) => canonical.starts_with(&canonical_root),
-            _ => false,
-        }
+    if !path_str.contains("..") && relative_path.is_relative() {
+        // The walker uses follow_links(false), so a relative path without ".."
+        // cannot escape the project root. An absolute path would replace the
+        // root in `Path::join`, so it always takes the canonicalization check.
+        return true;
+    }
+    // Path contains ".." or is absolute - do full canonicalization check
+    let full_path = project_root.join(relative_path);
+    match (full_path.canonicalize(), project_root.canonicalize()) {
+        (Ok(canonical), Ok(canonical_root)) => canonical.starts_with(&canonical_root),
+        _ => false,
     }
 }
 
@@ -2897,11 +2938,17 @@ impl IndexBuilder {
             // every file in the repo before returning results. mtime is compared
             // at nanosecond precision and paired with size so a same-second edit
             // can't slip through.
+            //
+            // size == 0 means the field was absent when the state was written
+            // (serde default). Fall back to mtime-only in that case. An entry
+            // recorded as genuinely empty takes the same path, so a later edit
+            // is only missed if it leaves the nanosecond mtime unchanged.
             let current_stat = file_stat(&full_path).ok();
             if let (Some(info), Some((current_mtime, current_size))) =
                 (state.files.get(path), current_stat)
             {
-                if info.mtime == current_mtime && info.size == current_size {
+                let size_matches = info.size == 0 || info.size == current_size;
+                if info.mtime == current_mtime && size_matches {
                     plan.unchanged += 1;
                     continue;
                 }