diff --git a/CHANGELOG.md b/CHANGELOG.md index ef98df3..84e32fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,57 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +## [1.0.160] - 2026-06-02 + +### Fixed + +- **`evaluate_csharp_rebuild` no longer holds `config.write()` during git/fs I/O** — + the bootstrap timestamp computation (git subprocess + ≤10 000-entry filesystem + walk) previously ran while holding the `config` write-lock, blocking every + concurrent `config.read()` caller for the full scan duration. The lock is now + acquired only for the brief config update; the slow work runs with no lock held. +- **`evaluate_csharp_rebuild` offloaded to `spawn_blocking` in phase 2** — even + after the lock fix, the function ran synchronously on a Tokio worker thread. + Wrapped in `spawn_blocking` at the call site in `run_phase_2_csharp_scip` so + the async runtime stays responsive while processing all C# candidates. +- **`build_index()` in warmup and add-repo background task now use `spawn_blocking`** — + two sites called the CPU-heavy HNSW `build_index()` directly on async threads. + Both now follow the established pattern (`spawn_blocking` + `blocking_write()`). +- **`reload_if_changed` uses `safe_canonicalize`** — replaces the raw + `std::fs::canonicalize` that could leave Windows `\\?\` UNC prefixes on the + config path, causing path comparisons to silently fail. +- **Accurate doc-comments on `relocate_missing` / `prune_stale`** — both + methods perform disk I/O (filesystem traversal, git subprocess) and should + be called via `spawn_blocking` in async contexts; the comments now say so. +- **`ensure_hnsw_index_if_needed` extracted and tested** — the safety-net HNSW + rebuild logic (detects and repairs a DB with chunks but no index, e.g. after + cancellation) is now a named `pub(crate)` function with 3 unit tests + (unindexed-with-chunks rebuilds, already-indexed is idempotent, empty DB skips). +- **`metadata.json` schema consistency** — the normal index path now writes + `"partial": false` so readers always find the field regardless of whether + indexing completed or was cancelled. +- **Cancellation finalisation is best-effort** — metadata write, FileMetaStore + save, and stats read in the cancel path now log-and-continue on failure + instead of propagating `Err`, so the partial chunks remain searchable even + if any recovery step fails. + +## [1.0.156] - 2026-06-02 + +### Fixed + +- **`reconcile_all_paths` no longer blocks the Tokio async runtime** — the + function spawns git subprocesses and holds the config `RwLock` write-guard + while scanning the filesystem. It is now offloaded via + `tokio::task::spawn_blocking` so Tokio worker threads stay responsive during + startup reconciliation. +- **Phase 1 auto-prune now honours `config_path_override`** — the prune path + wrote `repos.json` via `config.save()`, bypassing `ServeState::persist_config`. + All save sites in `ServeState` must route through `persist_config` so the + override (used in integration tests) is respected. Fixed to use + `self.persist_config(&config)`. + + + ## [1.0.154] - 2026-06-02 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index 5fe70f6..c51a7bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.154" +version = "1.0.160" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index d88f5ad..70e3c0e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.154" +version = "1.0.160" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/db_discovery/repos.rs b/src/db_discovery/repos.rs index 878e1e4..171833a 100644 --- a/src/db_discovery/repos.rs +++ b/src/db_discovery/repos.rs @@ -401,10 +401,16 @@ impl ReposConfig { /// /// For each missing path a best-effort git-identity relocation is attempted /// ([`Self::try_relocate`]); successful matches rewrite the in-memory - /// `repos` map. This is pure (no disk I/O, no logging) so callers can decide - /// how to report and persist. Returns `(relocated, unresolved)` where - /// `relocated` is the list of `(alias, new_path)` rewrites and `unresolved` - /// is the list of aliases whose path is still missing. + /// `repos` map. + /// + /// **Note:** this method performs disk I/O (filesystem traversal, git + /// subprocess) and should not be called while holding an async lock or from + /// an async task without `spawn_blocking`. No logging is emitted — callers + /// are responsible for reporting results. + /// + /// Returns `(relocated, unresolved)` where `relocated` is the list of + /// `(alias, new_path)` rewrites and `unresolved` is the list of aliases + /// whose path is still missing. #[must_use] pub fn relocate_missing(&mut self) -> (Vec<(String, PathBuf)>, Vec) { let aliases: Vec = self.repos.keys().cloned().collect(); @@ -431,7 +437,12 @@ impl ReposConfig { } /// Prune stale entries: relocate what can be relocated, then unregister the - /// rest. Pure (no disk I/O, no logging). Returns `(relocated, removed)`. + /// rest. + /// + /// **Note:** this method performs disk I/O (filesystem traversal, git + /// subprocess) via [`Self::relocate_missing`]. No logging is emitted. + /// + /// Returns `(relocated, removed)`. #[must_use] pub fn prune_stale(&mut self) -> (Vec<(String, PathBuf)>, Vec) { let (relocated, unresolved) = self.relocate_missing(); diff --git a/src/index/mod.rs b/src/index/mod.rs index 798fb2c..ac442fe 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -23,6 +23,31 @@ pub use manager::{ is_database_locked, CSharpRebuildNotifier, IndexManager, IndexingStatusCallback, SharedStores, }; +/// Ensure the HNSW vector index is built if it was never built in a previous +/// (possibly cancelled) run. +/// +/// Returns `true` if the index was rebuilt, `false` if nothing needed to be +/// done (already indexed, or no chunks present). Returns an error only if the +/// database cannot be opened or `build_index` fails; a failure reading stats is +/// treated as a warning and returns `Ok(false)`. +pub(crate) fn ensure_hnsw_index_if_needed( + db_path: &Path, + dimensions: usize, +) -> anyhow::Result { + let mut vs = VectorStore::new(db_path, dimensions)?; + match vs.stats() { + Ok(s) if s.total_chunks > 0 && !s.indexed => { + vs.build_index()?; + Ok(true) + } + Ok(_) => Ok(false), + Err(e) => { + tracing::warn!("could not check vector index status: {}", e); + Ok(false) + } + } +} + /// Update metadata.json with current chunk/file counts so that `status(projects)` /// can report accurate numbers without opening LMDB. pub(crate) fn update_metadata_stats(db_path: &Path, total_chunks: usize, total_files: usize) { @@ -628,6 +653,26 @@ async fn index_with_options( // If no changes and no deleted files, we're done if changed_files.is_empty() && deleted_files.is_empty() { + // Safety net: if a previous run was cancelled/interrupted mid-way, + // the HNSW vector index may never have been built. Detect this and + // rebuild now so the database is usable without requiring --force. + match ensure_hnsw_index_if_needed(&db_path, model_type.dimensions()) { + Ok(true) => { + log_print!( + "\n{}", + "🔨 Vector index not built from previous run — rebuilt successfully." + .yellow() + ); + } + Ok(false) => {} // already indexed or no chunks — all good + Err(e) => { + log_print!( + "{}", + format!("⚠️ Could not rebuild vector index: {}", e).yellow() + ); + } + } + log_print!("\n{}", "✅ Database is up to date!".green()); return Ok(()); } @@ -896,38 +941,121 @@ async fn index_with_options( // Memory is freed here - chunks/embeddings dropped before next file } - // Handle cancellation: exit quickly without blocking on build_index + // Handle cancellation: still finalize the index properly so the database + // remains usable. Skipping build_index() was the old behaviour — it left + // the database in a broken state that a subsequent incremental run could + // not recover from (no changed files → early return → index never built). if cancelled { pb.finish_with_message("Cancelled!"); - log_print!("\n{}", "⚠️ Indexing cancelled by user".yellow()); + log_print!( + "\n{}", + "⚠️ Indexing cancelled — finalising partial index...".yellow() + ); - // Free ONNX model memory immediately + // Free ONNX model memory before build_index (releases hundreds of MB) drop(embedding_service); drop(chunker); - // Don't call build_index() — it blocks for 10-30 seconds on large datasets. - // The database is in a partially written state, user can re-run with --force. - // Commit FTS with retry to avoid index corruption on shutdown. + // Commit FTS if total_chunks > 0 { if let Err(e) = fts_store.commit() { - // Log the error - best-effort commit failed + log_print!("{} FTS commit warning: {}", "⚠️ ".yellow(), e); + } + } + drop(fts_store); + + // Build vector index from the chunks that were successfully inserted + if total_chunks > 0 { + log_print!( + " Building vector index for {} partial chunks...", + total_chunks + ); + store.build_index()?; + log_print!(" ✅ Vector index built"); + } + + // Save metadata — best-effort: log and continue on failure so the + // partial chunks we already built are still searchable. + let metadata_json = serde_json::to_string_pretty(&serde_json::json!({ + "model_short_name": model_type.short_name(), + "model_name": model_type.name(), + "dimensions": model_type.dimensions(), + "indexed_at": chrono::Utc::now().to_rfc3339(), + "partial": true, + })); + match metadata_json { + Ok(json) => { + if let Err(e) = std::fs::write(db_path.join("metadata.json"), json) { + log_print!("{} metadata.json write warning: {}", "⚠️ ".yellow(), e); + } + } + Err(e) => { log_print!( - "{} FTS commit warning: {} (index may need recovery)", + "{} metadata.json serialise warning: {}", "⚠️ ".yellow(), e ); - log_print!( - "{} Run {} to rebuild the index cleanly if needed", - "💡 ".cyan(), - "codesearch index -f".bright_cyan() - ); + } + } + + // Update FileMetaStore with the files that were actually processed. + // Also best-effort: a failed save means the next incremental run will + // re-process those files, which is acceptable for a cancelled index. + if !file_chunks.is_empty() { + let save_result = if is_incremental { + let mut meta = file_meta_store.take().unwrap(); + for (file_path, chunk_ids) in file_chunks { + if let Err(e) = meta.update_file(Path::new(&file_path), chunk_ids) { + log_print!( + "{} file-meta update warning for '{}': {}", + "⚠️ ".yellow(), + file_path, + e + ); + } + } + meta.save(&db_path) } else { + let mut meta = FileMetaStore::new( + model_type.short_name().to_string(), + model_type.dimensions(), + ); + for (file_path, chunk_ids) in file_chunks { + if let Err(e) = meta.update_file(Path::new(&file_path), chunk_ids) { + log_print!( + "{} file-meta update warning for '{}': {}", + "⚠️ ".yellow(), + file_path, + e + ); + } + } + meta.save(&db_path) + }; + if let Err(e) = save_result { + log_print!("{} file-meta save warning: {}", "⚠️ ".yellow(), e); + } + } + + // Persist stats — best-effort, only for display; failures are non-fatal. + match store.stats() { + Ok(db_stats) => { + update_metadata_stats(&db_path, db_stats.total_chunks, db_stats.total_files); log_print!( - " Partial progress: {} chunks written (re-run with --force for clean index)", - total_chunks + " Partial index finalised: {} chunks, {} files", + db_stats.total_chunks, + db_stats.total_files ); } + Err(e) => { + log_print!("{} Could not read final stats: {}", "⚠️ ".yellow(), e); + } } + log_print!( + "{} Run {} to index the remaining files", + "💡 ".cyan(), + "codesearch index".bright_cyan() + ); return Ok(()); } @@ -1013,12 +1141,15 @@ async fn index_with_options( store.build_index()?; let _storage_duration = storage_start.elapsed(); - // Save model metadata + // Save model metadata. `partial: false` is explicit so the schema matches + // the cancel path (which writes `partial: true`); readers can always check + // the field regardless of how indexing completed. let metadata = serde_json::json!({ "model_short_name": model_short_name, "model_name": model_name, "dimensions": model_dimensions, "indexed_at": chrono::Utc::now().to_rfc3339(), + "partial": false, }); std::fs::write( db_path.join("metadata.json"), @@ -2236,3 +2367,103 @@ mod serve_probe_tests { ); } } + +#[cfg(test)] +mod index_quality_tests { + use super::ensure_hnsw_index_if_needed; + use crate::chunker::{Chunk, ChunkKind}; + use crate::embed::EmbeddedChunk; + use crate::vectordb::VectorStore; + use tempfile::tempdir; + + /// Helper: create a minimal EmbeddedChunk with `dims`-dimensional embedding. + fn fake_chunk(path: &str, dims: usize) -> EmbeddedChunk { + EmbeddedChunk::new( + Chunk::new( + "fn dummy() {}".to_string(), + 0, + 1, + ChunkKind::Function, + path.to_string(), + ), + vec![1.0_f32 / (dims as f32).sqrt(); dims], + ) + } + + /// `ensure_hnsw_index_if_needed` returns `true` and leaves the DB indexed + /// when there are chunks but the HNSW index was never built (simulates a + /// prior cancellation that finished inserting chunks but never called + /// `build_index`). + #[test] + fn rebuilds_unindexed_db_with_chunks() { + let dir = tempdir().unwrap(); + let db_path = dir.path().join("test.db"); + const DIMS: usize = 4; + + // Insert a chunk without calling build_index — simulate cancelled run. + { + let mut vs = VectorStore::new(&db_path, DIMS).unwrap(); + vs.insert_chunks(vec![fake_chunk("foo.rs", DIMS)]).unwrap(); + // Deliberately do NOT call vs.build_index() + let s = vs.stats().unwrap(); + assert!(s.total_chunks > 0, "precondition: DB has chunks"); + assert!(!s.indexed, "precondition: index not yet built"); + } + + // Safety-net must detect and rebuild the index. + let rebuilt = ensure_hnsw_index_if_needed(&db_path, DIMS) + .expect("ensure_hnsw_index_if_needed should not error"); + assert!( + rebuilt, + "expected the function to report it rebuilt the index" + ); + + // Verify: DB is now indexed. + let vs = VectorStore::new(&db_path, DIMS).unwrap(); + assert!( + vs.is_indexed(), + "VectorStore must be indexed after ensure_hnsw_index_if_needed" + ); + } + + /// `ensure_hnsw_index_if_needed` returns `false` (no rebuild) when the DB + /// is already indexed — repeated calls must be idempotent. + #[test] + fn no_rebuild_when_already_indexed() { + let dir = tempdir().unwrap(); + let db_path = dir.path().join("test.db"); + const DIMS: usize = 4; + + { + let mut vs = VectorStore::new(&db_path, DIMS).unwrap(); + vs.insert_chunks(vec![fake_chunk("bar.rs", DIMS)]).unwrap(); + vs.build_index().unwrap(); + assert!(vs.is_indexed(), "precondition: already indexed"); + } + + let rebuilt = ensure_hnsw_index_if_needed(&db_path, DIMS) + .expect("should succeed on already-indexed DB"); + assert!( + !rebuilt, + "should not report a rebuild on an already-indexed DB" + ); + } + + /// `ensure_hnsw_index_if_needed` returns `false` (no rebuild) for an empty + /// DB (no chunks, nothing to index). + #[test] + fn no_rebuild_for_empty_db() { + let dir = tempdir().unwrap(); + let db_path = dir.path().join("test.db"); + const DIMS: usize = 4; + + // Empty DB — no chunks inserted. + { + let _vs = VectorStore::new(&db_path, DIMS).unwrap(); + } + + let rebuilt = + ensure_hnsw_index_if_needed(&db_path, DIMS).expect("should succeed on empty DB"); + assert!(!rebuilt, "empty DB needs no rebuild"); + } +} diff --git a/src/serve/mod.rs b/src/serve/mod.rs index 8e31d0d..16e9896 100644 --- a/src/serve/mod.rs +++ b/src/serve/mod.rs @@ -463,6 +463,27 @@ impl ServeState { // Bootstrap last_changed_unix UP FRONT (before the has_index branch), // so the sort key is meaningful for both first-builds and refreshes. + // + // IMPORTANT: bootstrap_last_changed spawns a git subprocess and walks + // the filesystem (≤10,000 entries). Running that work while holding the + // config write-lock would block every concurrent config.read() call for + // the duration of the scan. Fix: check whether bootstrapping is needed + // under a *read* lock, perform the slow I/O outside any lock, then take + // the write lock only for the brief config update. + let needs_bootstrap = { + let cfg = match self.config.read() { + Ok(c) => c, + Err(_) => return RebuildDecision::ConfigPoisoned, + }; + cfg.meta(alias).last_changed_unix.is_none() + }; + // Slow git/fs work runs here — no lock held. + let bootstrapped_ts = if needs_bootstrap { + Self::bootstrap_last_changed(repo_path).or_else(|| Some(Self::now_unix_secs())) + } else { + None + }; + let (last_changed, last_scip, touched_bootstrap) = { let mut cfg = match self.config.write() { Ok(c) => c, @@ -471,8 +492,9 @@ impl ServeState { let mut meta = cfg.meta(alias); let mut touched = false; if meta.last_changed_unix.is_none() { - meta.last_changed_unix = - Self::bootstrap_last_changed(repo_path).or_else(|| Some(Self::now_unix_secs())); + // Another task may have bootstrapped between our read and write; + // the double-check here is intentional (TOCTOU-safe via the write lock). + meta.last_changed_unix = bootstrapped_ts; if let Some(ts) = meta.last_changed_unix { touched = cfg.touch_last_changed(alias, ts); } @@ -648,10 +670,12 @@ impl ServeState { self.repos.remove(alias); self.last_access.remove(alias); - // Unregister from repos.json + // Unregister from repos.json — route through persist_config + // so the config_path_override is honoured (same as all + // other save sites in ServeState). if let Ok(mut config) = self.config.write() { if config.unregister_alias(alias) { - if let Err(save_err) = config.save() { + if let Err(save_err) = self.persist_config(&config) { warn!( "phase-1: failed to save repos.json after pruning '{}': {}", alias, save_err @@ -691,7 +715,27 @@ impl ServeState { None => continue, }; let db_path = path.join(DB_DIR_NAME); - let decision = self.evaluate_csharp_rebuild(alias, &path, &db_path); + // evaluate_csharp_rebuild may spawn a git subprocess and walk the + // filesystem — offload to the blocking pool so the async runtime + // stays responsive while processing all candidates. + let state2 = self.clone(); + let alias2 = alias.clone(); + let path2 = path.clone(); + let db_path2 = db_path.clone(); + let decision = match tokio::task::spawn_blocking(move || { + state2.evaluate_csharp_rebuild(&alias2, &path2, &db_path2) + }) + .await + { + Ok(d) => d, + Err(e) => { + warn!( + "phase-2: evaluate_csharp_rebuild panicked for '{}': {:?}", + alias, e + ); + continue; + } + }; if !decision.needs_rebuild() { // If the SCIP index exists and is fresh, mark C# status as Ready // so the TUI shows the C# indicator (e.g. "C#·") instead of None. @@ -957,9 +1001,10 @@ impl ServeState { }, }; - // Canonicalize to resolve symlinks and prevent path traversal. + // Canonicalize to resolve symlinks, prevent path traversal, and strip + // Windows UNC prefix (\\?\) so paths compare correctly against stored values. // CodeQL: path derives from env var (CODESEARCH_REPOS_CONFIG) — validate before use. - let config_path = match std::fs::canonicalize(&config_path) { + let config_path = match safe_canonicalize(&config_path) { Ok(p) => p, Err(_) => return Ok(()), // file doesn't exist yet — nothing to reload }; @@ -1358,20 +1403,31 @@ impl ServeState { // When opening an existing DB, VectorStore starts with indexed=false. // Without this, search fails with "Index not built" until the background // refresh completes (which may take minutes for large repos). + // build_index() is CPU-heavy — offload to the blocking pool so the async + // runtime is not stalled while building the HNSW index for large repos. { - let mut vstore = stores.vector_store.write().await; - match vstore.stats() { - Ok(s) if s.total_chunks > 0 && !s.indexed => { - info!( - "Building vector index for '{}' ({} existing chunks)", - alias, s.total_chunks - ); - if let Err(e) = vstore.build_index() { - warn!("Failed to build vector index for '{}': {}", alias, e); + let vector_store = Arc::clone(&stores.vector_store); + let alias_owned = alias.to_string(); + match tokio::task::spawn_blocking(move || { + let mut vstore = vector_store.blocking_write(); + match vstore.stats() { + Ok(s) if s.total_chunks > 0 && !s.indexed => { + info!( + "Building vector index for '{}' ({} existing chunks)", + alias_owned, s.total_chunks + ); + if let Err(e) = vstore.build_index() { + warn!("Failed to build vector index for '{}': {}", alias_owned, e); + } } + Ok(_) => {} // already indexed or no chunks + Err(e) => warn!("Could not read stats for '{}': {}", alias_owned, e), } - Ok(_) => {} // already indexed or no chunks - Err(e) => warn!("Could not read stats for '{}': {}", alias, e), + }) + .await + { + Ok(()) => {} + Err(e) => warn!("warmup: build_index task panicked for '{}': {:?}", alias, e), } } @@ -2726,11 +2782,22 @@ async fn add_repo_handler( } } - // Build vector index from freshly indexed data + // Build vector index from freshly indexed data. + // build_index() is CPU-heavy — offload to the blocking pool. { - let mut vstore = stores.vector_store.write().await; - if let Err(e) = vstore.build_index() { - tracing::warn!("Failed to build vector index for '{}': {}", alias_bg, e); + let vector_store = Arc::clone(&stores.vector_store); + let alias_bi = alias_bg.clone(); + match tokio::task::spawn_blocking(move || { + let mut vstore = vector_store.blocking_write(); + vstore.build_index() + }) + .await + { + Ok(Ok(())) => {} + Ok(Err(e)) => { + tracing::warn!("Failed to build vector index for '{}': {}", alias_bi, e) + } + Err(e) => tracing::warn!("build_index task panicked for '{}': {:?}", alias_bi, e), } } @@ -3057,7 +3124,18 @@ pub async fn run_serve( { let phase_state = serve_state.clone(); tokio::spawn(async move { - phase_state.reconcile_all_paths(); + // reconcile_all_paths spawns git subprocesses and traverses the + // filesystem while holding the config RwLock write-guard. Running + // it on a Tokio worker thread would starve the async runtime and + // block all concurrent config.read() calls for the entire duration. + // spawn_blocking offloads the synchronous work to the blocking + // thread pool, then we await the handle before proceeding to Phase 1. + let reconcile_state = phase_state.clone(); + if let Err(e) = + tokio::task::spawn_blocking(move || reconcile_state.reconcile_all_paths()).await + { + warn!("reconcile: spawn_blocking panicked: {:?}", e); + } phase_state.run_phase_1_warmup_all().await; phase_state.run_phase_2_csharp_scip().await; phase_state.run_phase_3_prewarm().await;