Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 76 additions & 30 deletions colgrep/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1592,7 +1592,7 @@ impl IndexBuilder {
// instead of rebuilding from scratch. No-op for non-worktree projects.
self.maybe_seed_from_worktree(force);

let state = IndexState::load(&self.index_dir)?;
let mut state = IndexState::load(&self.index_dir)?;
let index_dir = get_vector_index_path(&self.index_dir);
let index_path = index_dir.to_str().unwrap();
let index_exists = index_dir.join("metadata.json").exists();
Expand All @@ -1611,9 +1611,47 @@ impl IndexBuilder {
&& !state.files.is_empty()
&& state.index_format_version != INDEX_FORMAT_VERSION;

// Forced or index-format change: clean atomic rebuild. Drop any in-progress
// resumable-build marker so we don't try to resume an index we're discarding.
if force || format_mismatch {
// Forced: clean atomic rebuild. Drop any in-progress resumable-build
// marker so we don't try to resume an index we're discarding.
if force {
let _ = std::fs::remove_file(self.index_dir.join(BUILDING_MARKER));
return self.full_rebuild(languages);
}

// Format version 0 → 1 did not change the on-disk index layout, only
// added version tracking and the FileInfo.size field (which defaults to
// 0 via serde). Migrate in place rather than discarding the entire index.
if format_mismatch && state.index_format_version == 0 {
state.index_format_version = INDEX_FORMAT_VERSION;
// Stat every entry to fill in missing sizes and purge stale entries.
let mut gone = Vec::new();
for (path, info) in state.files.iter_mut() {
let full = self.project_root.join(path);
match file_stat(&full) {
Ok((mtime, size)) => {
if info.size == 0 {
info.size = size;
}
// Legacy states store mtime in seconds; current code
// uses nanoseconds. Upgrade precision when the file
// hasn't been modified (same second), otherwise leave
// stale so the hash pass catches the real change.
if info.mtime < 10_000_000_000_000 {
if info.mtime == mtime / 1_000_000_000 {
info.mtime = mtime;
}
} else if info.mtime == 0 {
info.mtime = mtime;
}
}
Err(_) => gone.push(path.clone()),
}
}
for path in &gone {
state.files.remove(path);
}
state.save(&self.index_dir)?;
} else if format_mismatch {
let _ = std::fs::remove_file(self.index_dir.join(BUILDING_MARKER));
return self.full_rebuild(languages);
}
Expand Down Expand Up @@ -1948,7 +1986,7 @@ impl IndexBuilder {
let src_dir = &candidate.index_dir;
// Validate the sibling holds a complete, format-compatible, non-dirty index that
// isn't mid-build. Skip otherwise so we never seed from a half-built or stale store.
let Some(src_state) = seed_source_state(src_dir) else {
let Some(mut src_state) = seed_source_state(src_dir) else {
continue;
};
let src_vector = get_vector_index_path(src_dir);
Expand All @@ -1967,6 +2005,19 @@ impl IndexBuilder {
std::fs::rename(&tmp, &dest_vector)
.context("Failed to move seeded index into place")?;

// Refresh stats to match this worktree's files. The content hashes
// from the source are still valid (same git content), but mtimes
// differ because git checkout stamps files with the current time.
// Without this, the mtime fast path would miss on every file and
// trigger a full content-hash pass on the first search.
for (path, info) in src_state.files.iter_mut() {
let full = self.project_root.join(path);
if let Ok((mtime, size)) = file_stat(&full) {
info.mtime = mtime;
info.size = size;
}
}

// Persist state (save() restamps the version/format fields) and a
// fresh project.json pointing at THIS worktree, not the source.
src_state.save(&self.index_dir)?;
Expand Down Expand Up @@ -2678,31 +2729,20 @@ fn is_file_too_large(path: &Path) -> bool {
/// The check is done by canonicalizing both paths and verifying
/// the resolved path starts with the project root.
///
/// Returns `true` when `relative_path` is accepted as staying inside
/// `project_root`, `false` otherwise.
///
/// * A path whose textual form contains no `..` is accepted immediately,
///   without touching the filesystem (so it may name a file that does not
///   exist).
/// * A path that does contain `..` is joined onto `project_root`; both the
///   joined path and the root are canonicalized, and the path is accepted
///   only if the canonical result starts with the canonical root. Any
///   canonicalization failure (e.g. the path does not exist) rejects it.
fn is_within_project_root(project_root: &Path, relative_path: &Path) -> bool {
    // Fast path: nothing in the path can climb out of the root.
    // NOTE(review): this relies on the directory walker being configured with
    // follow_links(false) so that symlinked entries are never handed to us;
    // the walker is not visible from here - confirm before relying on it.
    if !relative_path.to_string_lossy().contains("..") {
        return true;
    }

    // The path contains ".." (possibly only as part of a file name such as
    // "a..b"): resolve it for real and compare against the resolved root.
    let full_path = project_root.join(relative_path);
    match (full_path.canonicalize(), project_root.canonicalize()) {
        (Ok(canonical), Ok(canonical_root)) => canonical.starts_with(&canonical_root),
        // Either side failed to resolve: reject rather than guess.
        _ => false,
    }
}

Expand Down Expand Up @@ -2897,11 +2937,17 @@ impl IndexBuilder {
// every file in the repo before returning results. mtime is compared
// at nanosecond precision and paired with size so a same-second edit
// can't slip through.
//
// size == 0 means the field was absent when the state was written
// (serde default). Fall back to mtime-only in that case. A real
// zero-byte file (e.g. a semaphore) has no content to index so a
// false-positive skip is harmless.
let current_stat = file_stat(&full_path).ok();
if let (Some(info), Some((current_mtime, current_size))) =
(state.files.get(path), current_stat)
{
if info.mtime == current_mtime && info.size == current_size {
let size_matches = info.size == 0 || info.size == current_size;
if info.mtime == current_mtime && size_matches {
plan.unchanged += 1;
continue;
}
Expand Down