diff --git a/CHANGELOG.md b/CHANGELOG.md index ef98df3..79603e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +## [1.0.156] - 2026-06-02 + +### Fixed + +- **`reconcile_all_paths` no longer blocks the Tokio async runtime** — the + function spawns git subprocesses and holds the config `RwLock` write-guard + while scanning the filesystem. It is now offloaded via + `tokio::task::spawn_blocking` so Tokio worker threads stay responsive during + startup reconciliation. +- **Phase 1 auto-prune now honours `config_path_override`** — the prune path + wrote `repos.json` via `config.save()`, bypassing `ServeState::persist_config`. + All save sites in `ServeState` must route through `persist_config` so the + override (used in integration tests) is respected. Fixed to use + `self.persist_config(&config)`. + + + ## [1.0.154] - 2026-06-02 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index 5fe70f6..48f875a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.154" +version = "1.0.156" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index d88f5ad..0fb6d8a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.154" +version = "1.0.156" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/index/mod.rs b/src/index/mod.rs index 798fb2c..d988582 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -628,6 +628,34 @@ async fn index_with_options( // If no changes and no deleted files, we're done if changed_files.is_empty() && deleted_files.is_empty() { + // Safety net: if a previous run was cancelled/interrupted mid-way, + // the HNSW vector index may never have been built. Detect this and + // rebuild now so the database is usable without requiring --force. + { + let mut vs = VectorStore::new(&db_path, model_type.dimensions())?; + match vs.stats() { + Ok(s) if s.total_chunks > 0 && !s.indexed => { + log_print!( + "\n{}", + format!( + "🔨 Vector index not built ({} chunks found from previous run). Rebuilding...", + s.total_chunks + ) + .yellow() + ); + vs.build_index()?; + log_print!("{}", "✅ Vector index rebuilt successfully!".green()); + } + Ok(_) => {} // already indexed or no chunks — all good + Err(e) => { + log_print!( + "{}", + format!("⚠️ Could not check vector index status: {}", e).yellow() + ); + } + } + } + log_print!("\n{}", "✅ Database is up to date!".green()); return Ok(()); } @@ -896,39 +924,86 @@ async fn index_with_options( // Memory is freed here - chunks/embeddings dropped before next file } - // Handle cancellation: exit quickly without blocking on build_index + // Handle cancellation: still finalize the index properly so the database + // remains usable. Skipping build_index() was the old behaviour — it left + // the database in a broken state that a subsequent incremental run could + // not recover from (no changed files → early return → index never built). if cancelled { pb.finish_with_message("Cancelled!"); - log_print!("\n{}", "⚠️ Indexing cancelled by user".yellow()); + log_print!( + "\n{}", + "⚠️ Indexing cancelled — finalising partial index...".yellow() + ); - // Free ONNX model memory immediately + // Free ONNX model memory before build_index (releases hundreds of MB) drop(embedding_service); drop(chunker); - // Don't call build_index() — it blocks for 10-30 seconds on large datasets. - // The database is in a partially written state, user can re-run with --force. - // Commit FTS with retry to avoid index corruption on shutdown. + // Commit FTS if total_chunks > 0 { if let Err(e) = fts_store.commit() { - // Log the error - best-effort commit failed - log_print!( - "{} FTS commit warning: {} (index may need recovery)", - "⚠️ ".yellow(), - e - ); - log_print!( - "{} Run {} to rebuild the index cleanly if needed", - "💡 ".cyan(), - "codesearch index -f".bright_cyan() - ); + log_print!("{} FTS commit warning: {}", "⚠️ ".yellow(), e); + } + } + drop(fts_store); + + // Build vector index from the chunks that were successfully inserted + if total_chunks > 0 { + log_print!( + " Building vector index for {} partial chunks...", + total_chunks + ); + store.build_index()?; + log_print!(" ✅ Vector index built"); + } + + // Save metadata + std::fs::write( + db_path.join("metadata.json"), + serde_json::to_string_pretty(&serde_json::json!({ + "model_short_name": model_type.short_name(), + "model_name": model_type.name(), + "dimensions": model_type.dimensions(), + "indexed_at": chrono::Utc::now().to_rfc3339(), + "partial": true, + }))?, + )?; + + // Update FileMetaStore with the files that were actually processed + if !file_chunks.is_empty() { + if is_incremental { + let mut meta = file_meta_store.take().unwrap(); + for (file_path, chunk_ids) in file_chunks { + meta.update_file(Path::new(&file_path), chunk_ids)?; + } + meta.save(&db_path)?; } else { - log_print!( - " Partial progress: {} chunks written (re-run with --force for clean index)", - total_chunks + let mut meta = FileMetaStore::new( + model_type.short_name().to_string(), + model_type.dimensions(), ); + for (file_path, chunk_ids) in file_chunks { + meta.update_file(Path::new(&file_path), chunk_ids)?; + } + meta.save(&db_path)?; } } + // Persist stats + let db_stats = store.stats()?; + update_metadata_stats(&db_path, db_stats.total_chunks, db_stats.total_files); + + log_print!( + " Partial index finalised: {} chunks, {} files", + db_stats.total_chunks, + db_stats.total_files + ); + log_print!( + "{} Run {} to index the remaining files", + "💡 ".cyan(), + "codesearch index".bright_cyan() + ); + return Ok(()); } diff --git a/src/serve/mod.rs b/src/serve/mod.rs index 8e31d0d..ddb192f 100644 --- a/src/serve/mod.rs +++ b/src/serve/mod.rs @@ -648,10 +648,12 @@ impl ServeState { self.repos.remove(alias); self.last_access.remove(alias); - // Unregister from repos.json + // Unregister from repos.json — route through persist_config + // so the config_path_override is honoured (same as all + // other save sites in ServeState). if let Ok(mut config) = self.config.write() { if config.unregister_alias(alias) { - if let Err(save_err) = config.save() { + if let Err(save_err) = self.persist_config(&config) { warn!( "phase-1: failed to save repos.json after pruning '{}': {}", alias, save_err @@ -3057,7 +3059,18 @@ pub async fn run_serve( { let phase_state = serve_state.clone(); tokio::spawn(async move { - phase_state.reconcile_all_paths(); + // reconcile_all_paths spawns git subprocesses and traverses the + // filesystem while holding the config RwLock write-guard. Running + // it on a Tokio worker thread would starve the async runtime and + // block all concurrent config.read() calls for the entire duration. + // spawn_blocking offloads the synchronous work to the blocking + // thread pool, then we await the handle before proceeding to Phase 1. + let reconcile_state = phase_state.clone(); + if let Err(e) = + tokio::task::spawn_blocking(move || reconcile_state.reconcile_all_paths()).await + { + warn!("reconcile: spawn_blocking panicked: {:?}", e); + } phase_state.run_phase_1_warmup_all().await; phase_state.run_phase_2_csharp_scip().await; phase_state.run_phase_3_prewarm().await;