From 14acb4a4dfb53930f9faa29b4a084bbe908f0031 Mon Sep 17 00:00:00 2001 From: flupkede Date: Mon, 1 Jun 2026 19:33:11 +0200 Subject: [PATCH 1/9] [worker] stage 1/5: capture git remote identity per repo Add RepoMeta.git_remote (serde default, backward compatible) and a best-effort git_remote_url() helper. Populate it in register() and register_with_alias() so every registered repo records its remote.origin.url for later relocation of moved/renamed folders. Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/db_discovery/repos.rs | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9d36a43..7bae2c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.142" +version = "1.0.143" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index e41c0b9..101b3e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.142" +version = "1.0.143" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/db_discovery/repos.rs b/src/db_discovery/repos.rs index dbdd1eb..f626eb5 100644 --- a/src/db_discovery/repos.rs +++ b/src/db_discovery/repos.rs @@ -24,6 +24,10 @@ pub struct RepoMeta { /// Unix timestamp (seconds) of last successful SCIP index rebuild. #[serde(default, skip_serializing_if = "Option::is_none")] pub last_scip_indexed_unix: Option, + /// Git remote URL (`remote.origin.url`) captured at registration time. + /// Used to re-locate a repo whose folder was renamed/moved (best-effort). 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub git_remote: Option, } #[derive(Debug, Deserialize)] @@ -108,6 +112,9 @@ impl ReposConfig { } let alias = unique_alias_for_path(&self.repos, &canonical); + if let Some(remote) = git_remote_url(&canonical) { + self.repos_meta.entry(alias.clone()).or_default().git_remote = Some(remote); + } self.repos.insert(alias.clone(), canonical); alias } @@ -137,6 +144,12 @@ impl ReposConfig { None => unique_alias_for_path(&self.repos, &canonical), }; + if let Some(remote) = git_remote_url(&canonical) { + self.repos_meta + .entry(final_alias.clone()) + .or_default() + .git_remote = Some(remote); + } self.repos.insert(final_alias.clone(), canonical); Ok(final_alias) } @@ -354,6 +367,31 @@ fn normalize_path_for_compare(path: &Path) -> String { crate::cache::normalize_path(path) } +/// Best-effort lookup of a directory's git remote URL (`remote.origin.url`). +/// +/// Returns `None` when `git` is unavailable, the path is not a git repo, or the +/// repo has no `origin` remote. Used both to capture a repo's identity at +/// registration time and to match candidate directories during relocation. 
+pub(crate) fn git_remote_url(path: &Path) -> Option { + let output = std::process::Command::new("git") + .arg("-C") + .arg(path) + .args(["config", "--get", "remote.origin.url"]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let url = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if url.is_empty() { + None + } else { + Some(url) + } +} + #[cfg(test)] mod tests { use super::*; From e826a7086e97e2bd12ec672ae5959059f57f1d85 Mon Sep 17 00:00:00 2001 From: flupkede Date: Mon, 1 Jun 2026 19:42:24 +0200 Subject: [PATCH 2/9] @ [worker] stage 2/5: relocate moved repos + reconcile pass + index prune - Best-effort git relocation: try_relocate() walks to nearest existing ancestor and bounded-depth scans for a git root with matching remote.origin.url; unambiguous single match rewrites repos.json. - ServeState::reconcile_all_paths() runs at startup before phase 1/2/3; relocates or warns+skips missing paths (never crashes). - Existence guards added to phase-2 SCIP and phase-3 prewarm consumers. - New `codesearch index prune` command: relocate-first, else unregister stale aliases, with summary output. - CODESEARCH_RELOCATE_MAX_DEPTH env (default 3). - Unit tests for capture-on-register and try_relocate (renamed leaf, path-exists, no-remote, ambiguous). 
Co-Authored-By: Claude Opus 4.8 (1M context) @ --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/cli/mod.rs | 4 + src/constants.rs | 7 ++ src/db_discovery/repos.rs | 192 ++++++++++++++++++++++++++++++++++++++ src/index/mod.rs | 53 +++++++++++ src/serve/mod.rs | 78 ++++++++++++++++ 7 files changed, 336 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7bae2c4..c053851 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.143" +version = "1.0.144" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index 101b3e5..917d5b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.143" +version = "1.0.144" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 813dd0d..636d072 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -49,6 +49,9 @@ pub enum IndexCommands { #[arg(short = 'f', long)] force: bool, }, + + /// Remove stale entries from repos.json (relocates moved repos first) + Prune, } /// Cache subcommands @@ -556,6 +559,7 @@ pub async fn run(cancel_token: CancellationToken) -> Result<()> { IndexCommands::Symbol { alias, force } => { trigger_symbol_reindex_via_api(&alias, force).await } + IndexCommands::Prune => crate::index::prune_index().await, } } else { // Flag-based backward-compat path diff --git a/src/constants.rs b/src/constants.rs index 0b03770..ba26ee5 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -192,6 +192,13 @@ pub const DEFAULT_EMBEDDING_DIMENSIONS: usize = 384; /// Environment variable to override repos config file path. pub const REPOS_CONFIG_ENV: &str = "CODESEARCH_REPOS_CONFIG"; +/// Environment variable to override how deep relocation scans for a moved repo. 
+pub const RELOCATE_MAX_DEPTH_ENV: &str = "CODESEARCH_RELOCATE_MAX_DEPTH"; + +/// Default bounded depth for the relocation scan (directories below the nearest +/// existing ancestor of a stale repo path). +pub const DEFAULT_RELOCATE_MAX_DEPTH: usize = 3; + /// Environment variable to set MCP mode: "auto", "client", or "local". pub const MCP_MODE_ENV: &str = "CODESEARCH_MCP_MODE"; diff --git a/src/db_discovery/repos.rs b/src/db_discovery/repos.rs index f626eb5..01326d2 100644 --- a/src/db_discovery/repos.rs +++ b/src/db_discovery/repos.rs @@ -292,6 +292,48 @@ impl ReposConfig { .find(|(_, p)| normalize_path_for_compare(p) == normalize_path_for_compare(&canonical)) .map(|(alias, _)| alias.clone()) } + + /// Best-effort relocation of a registered repo whose stored path no longer + /// exists (e.g. its folder was renamed/moved). Starting from the nearest + /// still-existing ancestor of the stale path, scans (bounded depth) for a + /// git repository whose `remote.origin.url` matches the one captured at + /// registration time. Returns the new path only on a single unambiguous + /// match; `None` when the path still exists, no remote was recorded, or the + /// match is absent/ambiguous. + pub fn try_relocate(&self, alias: &str) -> Option { + let stale = self.repos.get(alias)?; + if stale.exists() { + return None; // path is fine — nothing to relocate + } + + let target_remote = self.repos_meta.get(alias)?.git_remote.clone()?; + + // Walk up to the nearest ancestor that still exists on disk. + let mut anchor = stale.parent(); + while let Some(dir) = anchor { + if dir.exists() { + break; + } + anchor = dir.parent(); + } + let anchor = anchor?; + + let mut matches = Vec::new(); + scan_for_remote(anchor, &target_remote, relocate_max_depth(), &mut matches); + + // Don't relocate onto a path already registered under another alias. 
+ matches.retain(|p| { + !self.repos.iter().any(|(a, existing)| { + a != alias && normalize_path_for_compare(existing) == normalize_path_for_compare(p) + }) + }); + + if matches.len() == 1 { + Some(strip_unc_prefix(matches.into_iter().next().unwrap())) + } else { + None + } + } } pub fn config_dir() -> Result { @@ -392,11 +434,161 @@ pub(crate) fn git_remote_url(path: &Path) -> Option { } } +/// Configured relocation scan depth (`CODESEARCH_RELOCATE_MAX_DEPTH`, default 3). +fn relocate_max_depth() -> usize { + std::env::var(crate::constants::RELOCATE_MAX_DEPTH_ENV) + .ok() + .and_then(|v| v.trim().parse::().ok()) + .unwrap_or(crate::constants::DEFAULT_RELOCATE_MAX_DEPTH) +} + +/// Directory names never worth descending into during a relocation scan. +fn is_skippable_scan_dir(path: &Path) -> bool { + matches!( + path.file_name().and_then(|n| n.to_str()), + Some( + ".git" + | "node_modules" + | "target" + | "bin" + | "obj" + | "dist" + | "build" + | ".codesearch.db" + ) + ) +} + +/// Recursively collect git roots under `dir` (bounded by `depth`) whose +/// `remote.origin.url` matches `target_remote`. A matching git root is recorded +/// and not descended into (nested repos below it are ignored). +fn scan_for_remote(dir: &Path, target_remote: &str, depth: usize, out: &mut Vec) { + if dir.join(".git").exists() { + if git_remote_url(dir).as_deref() == Some(target_remote) { + out.push(dir.to_path_buf()); + } + return; + } + + if depth == 0 { + return; + } + + if let Ok(entries) = std::fs::read_dir(dir) { + for entry in entries.flatten() { + let child = entry.path(); + if child.is_dir() && !is_skippable_scan_dir(&child) { + scan_for_remote(&child, target_remote, depth - 1, out); + } + } + } +} + #[cfg(test)] mod tests { use super::*; use std::io::Write; + /// Initialise a git repo at `dir` with an `origin` remote pointing at `url`. 
+ fn init_git_remote(dir: &Path, url: &str) { + let run = |args: &[&str]| { + std::process::Command::new("git") + .arg("-C") + .arg(dir) + .args(args) + .output() + .expect("git available in test env") + }; + run(&["init"]); + run(&["remote", "add", "origin", url]); + } + + #[test] + fn captures_git_remote_on_register() { + let tmp = tempfile::tempdir().unwrap(); + let repo = tmp.path().join("repo"); + std::fs::create_dir(&repo).unwrap(); + init_git_remote(&repo, "https://example.com/acme/repo.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(repo); + assert_eq!( + cfg.meta(&alias).git_remote.as_deref(), + Some("https://example.com/acme/repo.git") + ); + } + + #[test] + fn try_relocate_finds_renamed_leaf() { + let tmp = tempfile::tempdir().unwrap(); + let original = tmp.path().join("myrepo"); + std::fs::create_dir(&original).unwrap(); + init_git_remote(&original, "https://example.com/acme/myrepo.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(original.clone()); + + // Rename the leaf folder; stored path is now stale. 
+ let renamed = tmp.path().join("myrepo-renamed"); + std::fs::rename(&original, &renamed).unwrap(); + + let found = cfg + .try_relocate(&alias) + .expect("should relocate renamed leaf"); + assert_eq!( + normalize_path_for_compare(&found), + normalize_path_for_compare(&renamed) + ); + } + + #[test] + fn try_relocate_returns_none_when_path_exists() { + let tmp = tempfile::tempdir().unwrap(); + let repo = tmp.path().join("live"); + std::fs::create_dir(&repo).unwrap(); + init_git_remote(&repo, "https://example.com/acme/live.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(repo); + assert!(cfg.try_relocate(&alias).is_none()); + } + + #[test] + fn try_relocate_none_without_recorded_remote() { + let tmp = tempfile::tempdir().unwrap(); + let plain = tmp.path().join("plain"); + std::fs::create_dir(&plain).unwrap(); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(plain.clone()); + assert!(cfg.meta(&alias).git_remote.is_none()); + + std::fs::rename(&plain, tmp.path().join("plain-moved")).unwrap(); + assert!(cfg.try_relocate(&alias).is_none()); + } + + #[test] + fn try_relocate_none_when_ambiguous() { + let tmp = tempfile::tempdir().unwrap(); + let original = tmp.path().join("orig"); + std::fs::create_dir(&original).unwrap(); + init_git_remote(&original, "https://example.com/acme/dup.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(original.clone()); + + // Two candidates with the same remote → ambiguous → no relocation. 
+ let a = tmp.path().join("copy-a"); + let b = tmp.path().join("copy-b"); + std::fs::create_dir(&a).unwrap(); + std::fs::create_dir(&b).unwrap(); + init_git_remote(&a, "https://example.com/acme/dup.git"); + init_git_remote(&b, "https://example.com/acme/dup.git"); + std::fs::remove_dir_all(&original).unwrap(); + + assert!(cfg.try_relocate(&alias).is_none()); + } + #[test] fn test_unique_alias_generation() { let mut repos = HashMap::new(); diff --git a/src/index/mod.rs b/src/index/mod.rs index 14a2d57..df2ddef 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -1250,6 +1250,59 @@ fn print_repo_stats(repo_path: &Path, db_path: &Path) -> Result<()> { } +/// Remove stale entries from `repos.json`. +/// +/// For each registered repo whose path no longer exists on disk (e.g. its +/// folder was renamed/moved), a best-effort git-identity relocation is tried +/// first; only entries that cannot be relocated are unregistered. Prints a +/// summary of what was relocated/removed. 
+pub async fn prune_index() -> Result<()> { + use crate::db_discovery::repos::ReposConfig; + + let mut config = ReposConfig::load()?; + let aliases: Vec = config.repos.keys().cloned().collect(); + + let mut relocated: Vec<(String, PathBuf)> = Vec::new(); + let mut removed: Vec = Vec::new(); + + for alias in &aliases { + let Some(path) = config.resolve(alias) else { + continue; + }; + if path.exists() { + continue; + } + + if let Some(new_path) = config.try_relocate(alias) { + config.repos.insert(alias.clone(), new_path.clone()); + relocated.push((alias.clone(), new_path)); + } else if config.unregister_alias(alias) { + removed.push(alias.clone()); + } + } + + if relocated.is_empty() && removed.is_empty() { + println!("✅ No stale repositories found — repos.json is clean."); + return Ok(()); + } + + config.save()?; + + for (alias, path) in &relocated { + println!("📍 relocated '{}' → {}", alias, path.display()); + } + for alias in &removed { + println!("🗑️ removed stale entry '{}'", alias); + } + println!( + "✅ Prune complete: {} relocated, {} removed.", + relocated.len(), + removed.len() + ); + + Ok(()) +} + /// Add a repository to the index (creates local or global) pub async fn add_to_index( path: Option, global: bool, diff --git a/src/serve/mod.rs b/src/serve/mod.rs index 82c4c39..452462d 100644 --- a/src/serve/mod.rs +++ b/src/serve/mod.rs @@ -563,6 +563,66 @@ impl ServeState { }); } + /// Reconcile registered repo paths against the filesystem before warmup. + /// + /// For each alias whose stored path no longer exists (folder renamed/moved), + /// attempt a best-effort git-identity relocation and rewrite `repos.json`. + /// When relocation fails the entry is left in place and merely logged — it is + /// skipped safely at warmup and never crashes serve. Explicit cleanup of + /// unrecoverable entries is available via `codesearch index prune`. 
+ pub(crate) fn reconcile_all_paths(self: &Arc) { + let aliases = self.aliases(); + if aliases.is_empty() { + return; + } + + let mut relocated = false; + + let mut config = match self.config.write() { + Ok(c) => c, + Err(e) => { + warn!("reconcile: config lock poisoned: {}", e); + return; + } + }; + + for alias in &aliases { + let Some(path) = config.resolve(alias) else { + continue; + }; + if path.exists() { + continue; + } + + match config.try_relocate(alias) { + Some(new_path) => { + info!( + "reconcile: relocated '{}' → {} (was {})", + alias, + new_path.display(), + path.display() + ); + config.repos.insert(alias.clone(), new_path); + relocated = true; + } + None => { + warn!( + "reconcile: '{}' path missing ({}); skipping — \ + run `codesearch index prune` to remove it", + alias, + path.display() + ); + } + } + } + + if relocated { + if let Err(e) = self.persist_config(&config) { + warn!("reconcile: failed to persist relocated paths: {}", e); + } + } + } + /// Phase 1: warm all repos sequentially, awaiting incremental refresh per repo. pub(crate) async fn run_phase_1_warmup_all(self: &Arc) { let aliases = self.aliases(); @@ -662,6 +722,18 @@ impl ServeState { return; } }; + // Guard against stale entries whose folder was removed/renamed + // and could not be relocated: skip rather than run SCIP on a + // non-existent path. + if !path.exists() { + warn!( + "phase-2: skip '{}' — path missing ({})", + alias, + path.display() + ); + drop(permit); + return; + } let db_path = path.join(DB_DIR_NAME); trigger_symbol_rebuild(&alias, &path, &db_path, &state).await; drop(permit); @@ -707,6 +779,11 @@ impl ServeState { None => continue, }; + // Skip stale entries whose folder no longer exists. 
+ if !path.exists() { + continue; + } + // Only pre-warm repos that have a ready C# index let status = self .csharp_index_status @@ -2940,6 +3017,7 @@ pub async fn run_serve( { let phase_state = serve_state.clone(); tokio::spawn(async move { + phase_state.reconcile_all_paths(); phase_state.run_phase_1_warmup_all().await; phase_state.run_phase_2_csharp_scip().await; phase_state.run_phase_3_prewarm().await; From 5d83b3e6197a7d65848ead5b05681bd96770f6e3 Mon Sep 17 00:00:00 2001 From: flupkede Date: Mon, 1 Jun 2026 19:51:44 +0200 Subject: [PATCH 3/9] @ [worker] stage 3/5: remove user-settable --alias, always derive - Drop `--alias`/`-a` from `index add` subcommand and the legacy `index --add` flag path. Alias is always derived from the directory name via ReposConfig::register(). - add_to_index() loses its `alias` parameter; legacy current-dir local DBs are now auto-registered with a derived alias. - Serve delegation always sends None so serve derives the alias too. - Replace test_cli_index_add_accepts_alias_flag with test_cli_index_add_rejects_alias_flag + parses_without_alias. 
Co-Authored-By: Claude Opus 4.8 (1M context) @ --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/cli/mod.rs | 42 ++++++++++++++++++---------------------- src/index/mod.rs | 50 +++++++++++++++++------------------------------- 4 files changed, 39 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c053851..2b4b47c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.144" +version = "1.0.145" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index 917d5b1..8fc6b77 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.144" +version = "1.0.145" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 636d072..4c865dc 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -20,10 +20,6 @@ pub enum IndexCommands { /// Create global index instead of local #[arg(short = 'g', long)] global: bool, - - /// Alias for this repository (auto-generated from directory name if omitted) - #[arg(short, long)] - alias: Option, }, /// Remove the index (local or global, auto-detected) @@ -238,10 +234,6 @@ pub enum Commands { #[arg(short = 'g', long)] global: bool, - /// Alias for this repository (only with --add) - #[arg(short, long)] - alias: Option, - /// Remove the index (local or global, auto-detected) #[arg(long, visible_alias = "rm")] remove: bool, @@ -535,7 +527,6 @@ pub async fn run(cancel_token: CancellationToken) -> Result<()> { symbols, add, global, - alias, remove, keep_config, list, @@ -546,11 +537,7 @@ pub async fn run(cancel_token: CancellationToken) -> Result<()> { IndexCommands::Add { path: add_path, global, - alias, - } => { - crate::index::add_to_index(add_path, global, alias, cancel_token.clone()) - .await - } + } => crate::index::add_to_index(add_path, global, cancel_token.clone()).await, IndexCommands::Remove { path: rm_path, keep_config, @@ 
-573,8 +560,7 @@ pub async fn run(cancel_token: CancellationToken) -> Result<()> { if add || is_add_cmd { let effective_path = if is_add_cmd { None } else { path }; - crate::index::add_to_index(effective_path, global, alias, cancel_token.clone()) - .await + crate::index::add_to_index(effective_path, global, cancel_token.clone()).await } else if remove || is_rm_cmd { let effective_path = if is_rm_cmd { None } else { path }; crate::index::remove_from_index(effective_path, keep_config).await @@ -915,22 +901,32 @@ mod tests { } #[test] - fn test_cli_index_add_accepts_alias_flag() { - let cli = Cli::try_parse_from([ + fn test_cli_index_add_rejects_alias_flag() { + // The user-settable alias was removed; the flag must no longer parse. + let result = Cli::try_parse_from([ "codesearch", "index", "add", "/tmp/foo", "--alias", "myrepo", - ]) - .expect("cli parse should succeed"); + ]); + assert!( + result.is_err(), + "'--alias' flag should no longer be accepted on `index add`" + ); + } + + #[test] + fn test_cli_index_add_parses_without_alias() { + let cli = Cli::try_parse_from(["codesearch", "index", "add", "/tmp/foo"]) + .expect("cli parse should succeed"); match cli.command { Commands::Index { - command: Some(IndexCommands::Add { alias: Some(a), .. }), + command: Some(IndexCommands::Add { path: Some(p), .. }), .. 
- } => assert_eq!(a, "myrepo"), - _ => panic!("expected Index::Add subcommand with alias"), + } => assert_eq!(p, std::path::PathBuf::from("/tmp/foo")), + _ => panic!("expected Index::Add subcommand"), } } diff --git a/src/index/mod.rs b/src/index/mod.rs index df2ddef..cbecd00 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -1306,7 +1306,6 @@ pub async fn prune_index() -> Result<()> { pub async fn add_to_index( path: Option, global: bool, - alias: Option, cancel_token: CancellationToken, ) -> Result<()> { let project_path = path.as_deref().unwrap_or_else(|| Path::new(".")); @@ -1320,8 +1319,9 @@ pub async fn add_to_index( // Serve handles: register in repos.json + create index + warmup. let add_delegate = serve_delegate_with_warmup_wait(|| { let path = path.clone(); - let alias = alias.clone(); - async move { try_delegate_add_to_serve(&path, &alias, global).await } + // Alias is always derived from the directory name; the CLI no longer + // lets the user set it. Pass None so serve derives it consistently. + async move { try_delegate_add_to_serve(&path, &None, global).await } }) .await; @@ -1368,24 +1368,19 @@ pub async fn add_to_index( println!(" Type: {}", "Local".bright_green()); } - // If an alias is provided and this is a local DB in the current dir, - // register it in repos.json (for legacy DB's that predate auto-registration). - if alias.is_some() && db.is_current && !db.is_global { + // If this is a local DB in the current dir, ensure it is registered in + // repos.json (for legacy DBs that predate auto-registration). The alias + // is always derived from the directory name. 
+ if db.is_current && !db.is_global { let mut config = crate::db_discovery::repos::ReposConfig::load().unwrap_or_default(); if let Some(existing) = config.alias_for_path(&canonical_path) { println!(" Already registered as '{}'.", existing); } else { - match config.register_with_alias(canonical_path.clone(), alias.clone()) { - Ok(assigned) => { - if let Err(e) = config.save() { - eprintln!("⚠️ Failed to save repos config: {}", e); - } else { - println!(" ✅ Registered as '{}'.", assigned); - } - } - Err(e) => { - eprintln!("⚠️ Registration failed: {}", e); - } + let assigned = config.register(canonical_path.clone()); + if let Err(e) = config.save() { + eprintln!("⚠️ Failed to save repos config: {}", e); + } else { + println!(" ✅ Registered as '{}'.", assigned); } } return Ok(()); @@ -1479,21 +1474,12 @@ pub async fn add_to_index( if let Some(existing) = config.alias_for_path(&canonical_path) { eprintln!("ℹ️ Already registered as '{}'.", existing); } else { - match config.register_with_alias(canonical_path.clone(), alias) { - Ok(assigned) => { - if let Err(e) = config.save() { - eprintln!("⚠️ Index created, but failed to save repos config: {}", e); - eprintln!(" Config path: {}", config_path.display()); - } else { - eprintln!("✅ Registered as '{}'.", assigned); - } - } - Err(e) => { - return Err(anyhow::anyhow!( - "Index created, but registration failed: {}", - e - )); - } + let assigned = config.register(canonical_path.clone()); + if let Err(e) = config.save() { + eprintln!("⚠️ Index created, but failed to save repos config: {}", e); + eprintln!(" Config path: {}", config_path.display()); + } else { + eprintln!("✅ Registered as '{}'.", assigned); } } } From e89a69f306b1f73f16bb3a40a504000203ebfd41 Mon Sep 17 00:00:00 2001 From: flupkede Date: Mon, 1 Jun 2026 19:57:57 +0200 Subject: [PATCH 4/9] @ [worker] stage 4/5: tolerate hand-edited repos.json via reconcile() - ReposConfig::reconcile() runs from load_from() on both new and legacy parse paths (in-memory only, no 
disk write): 1. drop entries with empty/blank alias keys 2. drop orphan repos_meta entries with no matching repo 3. prune group members referencing unknown aliases; drop empty groups - Never renames existing alias keys (would break group refs); a non-standard hand-edited alias is tolerated as-is. Never crashes. - Unit tests for empty-key, group-pruning/empty-group, and orphan-meta. Co-Authored-By: Claude Opus 4.8 (1M context) @ --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/db_discovery/repos.rs | 109 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 108 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2b4b47c..3312e2d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.145" +version = "1.0.146" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index 8fc6b77..ad093fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.145" +version = "1.0.146" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/db_discovery/repos.rs b/src/db_discovery/repos.rs index 01326d2..99a0444 100644 --- a/src/db_discovery/repos.rs +++ b/src/db_discovery/repos.rs @@ -51,7 +51,8 @@ impl ReposConfig { let content = fs::read_to_string(path)?; // New format - if let Ok(config) = serde_json::from_str::(&content) { + if let Ok(mut config) = serde_json::from_str::(&content) { + config.reconcile(); return Ok(config); } @@ -64,11 +65,13 @@ impl ReposConfig { repos.insert(alias, path); } - return Ok(Self { + let mut config = Self { repos, groups: HashMap::new(), repos_meta: HashMap::new(), - }); + }; + config.reconcile(); + return Ok(config); } // Both parses failed — file is corrupt @@ -78,6 +81,65 @@ impl ReposConfig { )) } + /// Harden an in-memory config loaded from disk so a hand-edited + /// `repos.json` can never crash the app. 
This is best-effort cleanup, + /// performed in memory only (no disk write here): + /// + /// 1. Drop repo entries whose alias key is empty/blank. + /// 2. Drop `repos_meta` entries that reference an unknown alias. + /// 3. Prune group members that reference unknown aliases; drop now-empty + /// groups. + /// + /// Existing (non-empty) alias keys are never renamed — that would break + /// group references — so a merely "non-standard" hand-edited alias is + /// tolerated as-is. + pub fn reconcile(&mut self) { + // 1. Drop empty/blank alias keys. + let empty_keys: Vec = self + .repos + .keys() + .filter(|alias| alias.trim().is_empty()) + .cloned() + .collect(); + for alias in empty_keys { + tracing::warn!("repos.json: dropping entry with empty alias key"); + self.repos.remove(&alias); + } + + // 2. Drop meta entries pointing at unknown aliases. + let orphan_meta: Vec = self + .repos_meta + .keys() + .filter(|alias| !self.repos.contains_key(*alias)) + .cloned() + .collect(); + for alias in orphan_meta { + tracing::warn!("repos.json: dropping orphan metadata for '{}'", alias); + self.repos_meta.remove(&alias); + } + + // 3. Prune group members referencing unknown aliases; drop empty groups. 
+ let mut empty_groups: Vec = Vec::new(); + for (group, members) in self.groups.iter_mut() { + let before = members.len(); + members.retain(|alias| self.repos.contains_key(alias)); + if members.len() != before { + tracing::warn!( + "repos.json: pruned {} unknown alias(es) from group '{}'", + before - members.len(), + group + ); + } + if members.is_empty() { + empty_groups.push(group.clone()); + } + } + for group in empty_groups { + tracing::warn!("repos.json: dropping now-empty group '{}'", group); + self.groups.remove(&group); + } + } + pub fn save(&self) -> Result<()> { let path = Self::path()?; self.save_to(&path) @@ -567,6 +629,47 @@ mod tests { assert!(cfg.try_relocate(&alias).is_none()); } + #[test] + fn reconcile_drops_empty_alias_key() { + let mut cfg = ReposConfig::default(); + cfg.repos.insert(String::new(), PathBuf::from("/tmp/x")); + cfg.repos + .insert("good".to_string(), PathBuf::from("/tmp/good")); + cfg.reconcile(); + assert!(!cfg.repos.contains_key("")); + assert!(cfg.repos.contains_key("good")); + } + + #[test] + fn reconcile_prunes_unknown_group_members_and_empty_groups() { + let mut cfg = ReposConfig::default(); + cfg.repos + .insert("real".to_string(), PathBuf::from("/tmp/real")); + cfg.groups.insert( + "mix".to_string(), + vec!["real".to_string(), "ghost".to_string()], + ); + cfg.groups + .insert("dead".to_string(), vec!["ghost".to_string()]); + cfg.reconcile(); + assert_eq!(cfg.groups.get("mix"), Some(&vec!["real".to_string()])); + assert!( + !cfg.groups.contains_key("dead"), + "group with only unknown members should be dropped" + ); + } + + #[test] + fn reconcile_drops_orphan_meta() { + let mut cfg = ReposConfig::default(); + cfg.repos + .insert("real".to_string(), PathBuf::from("/tmp/real")); + cfg.repos_meta + .insert("ghost".to_string(), RepoMeta::default()); + cfg.reconcile(); + assert!(!cfg.repos_meta.contains_key("ghost")); + } + #[test] fn try_relocate_none_when_ambiguous() { let tmp = tempfile::tempdir().unwrap(); From 
2d91da77c064264fc10284cf4986f297e50cf77b Mon Sep 17 00:00:00 2001 From: flupkede Date: Mon, 1 Jun 2026 20:03:00 +0200 Subject: [PATCH 5/9] @ [worker] stage 5/5: docs + tighten reconcile() visibility - Document stale-path relocation, `index prune`, derived-alias policy, and repos.json reconcile() in AGENTS.md and .claude/CLAUDE.md. - reconcile() is now pub(crate) (only used internally + same-module tests). Co-Authored-By: Claude Opus 4.8 (1M context) @ --- .claude/CLAUDE.md | 1 + AGENTS.md | 5 ++++- Cargo.lock | 2 +- Cargo.toml | 2 +- src/db_discovery/repos.rs | 2 +- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 06c9ec9..9ec7462 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -22,6 +22,7 @@ Add symbol-aware reference lookups to codesearch via `find_impact` MCP tool. Ret - **`-with-csharp` release variants** — 6 release archives (3 plain + 3 with helper) - **Gated integration test** — `csharp_helper_integration` cargo feature for full-pipeline testing - **CI** — separate `csharp-integration-tests` job in `.github/workflows/ci.yml` +- **Stale-path resilience + derived alias** — moved/renamed indexed folders no longer crash serve: `git_remote` captured at registration, `reconcile_all_paths()` best-effort relocates by matching `remote.origin.url` (bounded depth, `CODESEARCH_RELOCATE_MAX_DEPTH`, default 3) else warn+skip; `codesearch index prune` for manual cleanup. The `--alias` flag was removed (alias always = directory name). `ReposConfig::reconcile()` hardens hand-edited `repos.json` on load. See AGENTS.md for details. ## Architecture diff --git a/AGENTS.md b/AGENTS.md index a6b967e..f983f28 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -26,7 +26,10 @@ Add symbol-aware reference lookups to codesearch via `find_impact` MCP tool. 
Ret - **Gated integration test** — `csharp_helper_integration` cargo feature for full-pipeline testing - **CI** — separate `csharp-integration-tests` job in `.github/workflows/ci.yml` - **Sequential phase-2 startup** — Phase 1 warms repos sequentially, Phase 2 runs gated C# SCIP rebuilds ordered by `last_changed_unix` under `Semaphore(concurrency)` via `CSHARP_SCIP_CONCURRENCY` env (default **2**, clamp [1,4]) -- **`repos_meta` tracking** — `RepoMeta` (last_changed_unix, last_scip_indexed_unix) persisted in `repos.json` with debounced save (10s window) +- **`repos_meta` tracking** — `RepoMeta` (last_changed_unix, last_scip_indexed_unix, git_remote) persisted in `repos.json` with debounced save (10s window) +- **Stale-path resilience** — a renamed/moved indexed folder no longer crashes serve. `git_remote` (`remote.origin.url`) is captured at registration; on startup `ServeState::reconcile_all_paths()` best-effort relocates a missing repo by scanning the nearest existing ancestor (bounded depth, env `CODESEARCH_RELOCATE_MAX_DEPTH`, default 3) for a git root with a matching remote — exactly one match rewrites `repos.json`, otherwise warn + skip. Phase-2/Phase-3 also guard `path.exists()`. Manual cleanup via **`codesearch index prune`** (relocate-first, else unregister stale aliases) +- **Alias is always derived** — the user-settable `--alias`/`-a` flag was removed from `index add`; the alias always equals the (sanitized) directory name via `ReposConfig::register()`. The alias remains the internal identifier (repos.json key, groups, `project` arg); only user override is gone. The `index symbol <alias>` positional is a lookup key and is retained +- **Hand-edited `repos.json` tolerated** — `ReposConfig::reconcile()` runs in-memory on every load: drops empty-alias entries, drops orphan `repos_meta`, prunes group members referencing unknown aliases and empty groups. 
Never renames valid aliases, never crashes - **TUI C# indicator** — in status column: green `C#·` ready, yellow `C#…` indexing, red `C#!` error; footer shows helper availability; Calls column with tool call count - **Phase 2 & 3 TUI feedback** — Phase 2 pre-marks all queued candidates as `C#…` immediately on discovery (before semaphore slot); Phase 3 pre-warm sets `csharp_index_status = Indexing` before `batch-find-refs` and restores `Ready` after — TUI shows `C#…` throughout without touching `active_reindexes` (avoids blocking HTTP /reindex) - **Selective ref cache invalidation** — incremental rebuilds only purge cached refs for affected symbols, not entire cache diff --git a/Cargo.lock b/Cargo.lock index 3312e2d..e7c1852 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.146" +version = "1.0.147" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index ad093fc..dcd1445 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.146" +version = "1.0.147" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/db_discovery/repos.rs b/src/db_discovery/repos.rs index 99a0444..5876264 100644 --- a/src/db_discovery/repos.rs +++ b/src/db_discovery/repos.rs @@ -93,7 +93,7 @@ impl ReposConfig { /// Existing (non-empty) alias keys are never renamed — that would break /// group references — so a merely "non-standard" hand-edited alias is /// tolerated as-is. - pub fn reconcile(&mut self) { + pub(crate) fn reconcile(&mut self) { // 1. Drop empty/blank alias keys. 
let empty_keys: Vec<String> = self .repos From fd9bae5aaf203dd6eda93f87d3a8e12fba656e23 Mon Sep 17 00:00:00 2001 From: flupkede Date: Mon, 1 Jun 2026 20:10:33 +0200 Subject: [PATCH 6/9] @ [worker] final review: use DB_DIR_NAME constant in relocation scan skip-list Replace hardcoded ".codesearch.db" literal with crate::constants::DB_DIR_NAME in is_skippable_scan_dir (no-hardcoded-config-strings rule). Co-Authored-By: Claude Opus 4.8 (1M context) @ --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/db_discovery/repos.rs | 19 +++++++------------ 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e7c1852..b1ae05b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.147" +version = "1.0.148" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index dcd1445..22e3aa2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.147" +version = "1.0.148" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/db_discovery/repos.rs b/src/db_discovery/repos.rs index 5876264..8f63c66 100644 --- a/src/db_discovery/repos.rs +++ b/src/db_discovery/repos.rs @@ -506,19 +506,14 @@ fn relocate_max_depth() -> usize { /// Directory names never worth descending into during a relocation scan.
fn is_skippable_scan_dir(path: &Path) -> bool { - matches!( - path.file_name().and_then(|n| n.to_str()), - Some( - ".git" - | "node_modules" - | "target" - | "bin" - | "obj" - | "dist" - | "build" - | ".codesearch.db" + let Some(name) = path.file_name().and_then(|n| n.to_str()) else { + return false; + }; + name == crate::constants::DB_DIR_NAME + || matches!( + name, + ".git" | "node_modules" | "target" | "bin" | "obj" | "dist" | "build" ) - ) } /// Recursively collect git roots under `dir` (bounded by `depth`) whose From a21571187ed6713e325316b87e435b678a2b85cc Mon Sep 17 00:00:00 2001 From: flupkede Date: Mon, 1 Jun 2026 20:50:26 +0200 Subject: [PATCH 7/9] @ [worker] tests: extract testable prune_stale/relocate_missing + expand coverage Refactor for testability (no behavior change): - Add pure ReposConfig::relocate_missing() -> (relocated, unresolved) and prune_stale() -> (relocated, removed); no disk I/O, no logging. - prune_index() and ServeState::reconcile_all_paths() now delegate to these, removing duplicated relocate-loop logic. New unit tests (8): - register_derives_alias_from_directory_name - try_relocate_finds_renamed_parent (parent-level rename within depth) - try_relocate_none_beyond_max_depth (depth bound enforced) - relocate_missing_rewrites_only_moved_repos - prune_stale_removes_unrelocatable_entries (+ group cleanup) - prune_stale_relocates_then_keeps_relocatable_entries - load_from_applies_reconcile_to_hand_edited_file (load-path reconcile) 24 repos lib tests pass; clippy clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) @ --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/db_discovery/repos.rs | 200 ++++++++++++++++++++++++++++++++++++++ src/index/mod.rs | 21 +--- src/serve/mod.rs | 41 +++----- 5 files changed, 214 insertions(+), 52 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b1ae05b..cb147e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.148" +version = "1.0.149" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index 22e3aa2..d8eb68a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.148" +version = "1.0.149" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/db_discovery/repos.rs b/src/db_discovery/repos.rs index 8f63c66..bca28e9 100644 --- a/src/db_discovery/repos.rs +++ b/src/db_discovery/repos.rs @@ -396,6 +396,53 @@ impl ReposConfig { None } } + + /// Relocate every registered repo whose stored path no longer exists. + /// + /// For each missing path a best-effort git-identity relocation is attempted + /// ([`Self::try_relocate`]); successful matches rewrite the in-memory + /// `repos` map. This is pure (no disk I/O, no logging) so callers can decide + /// how to report and persist. Returns `(relocated, unresolved)` where + /// `relocated` is the list of `(alias, new_path)` rewrites and `unresolved` + /// is the list of aliases whose path is still missing. 
+ #[must_use] + pub fn relocate_missing(&mut self) -> (Vec<(String, PathBuf)>, Vec<String>) { + let aliases: Vec<String> = self.repos.keys().cloned().collect(); + let mut relocated = Vec::new(); + let mut unresolved = Vec::new(); + + for alias in aliases { + let Some(path) = self.repos.get(&alias) else { + continue; + }; + if path.exists() { + continue; + } + match self.try_relocate(&alias) { + Some(new_path) => { + self.repos.insert(alias.clone(), new_path.clone()); + relocated.push((alias, new_path)); + } + None => unresolved.push(alias), + } + } + + (relocated, unresolved) + } + + /// Prune stale entries: relocate what can be relocated, then unregister the + /// rest. Pure (no disk I/O, no logging). Returns `(relocated, removed)`. + #[must_use] + pub fn prune_stale(&mut self) -> (Vec<(String, PathBuf)>, Vec<String>) { + let (relocated, unresolved) = self.relocate_missing(); + let mut removed = Vec::new(); + for alias in unresolved { + if self.unregister_alias(&alias) { + removed.push(alias); + } + } + (relocated, removed) + } } pub fn config_dir() -> Result { @@ -575,6 +622,159 @@ mod tests { ); } + #[test] + fn register_derives_alias_from_directory_name() { + let tmp = tempfile::tempdir().unwrap(); + let repo = tmp.path().join("My.Cool-Repo"); + std::fs::create_dir(&repo).unwrap(); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(repo.clone()); + // Alias is derived from (and sanitized from) the directory name.
+ assert_eq!(alias, sanitize_alias("My.Cool-Repo")); + assert!(cfg.repos.contains_key(&alias)); + } + + #[test] + fn try_relocate_finds_renamed_parent() { + let tmp = tempfile::tempdir().unwrap(); + let parent = tmp.path().join("parent"); + let repo = parent.join("repo"); + std::fs::create_dir_all(&repo).unwrap(); + init_git_remote(&repo, "https://example.com/acme/parent-repo.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(repo.clone()); + + // Rename the PARENT folder; the stored repo path is now stale, but the + // repo itself sits one level below the nearest existing ancestor (tmp). + std::fs::rename(&parent, tmp.path().join("parent-renamed")).unwrap(); + + let expected = tmp.path().join("parent-renamed").join("repo"); + let found = cfg + .try_relocate(&alias) + .expect("should relocate via renamed parent"); + assert_eq!( + normalize_path_for_compare(&found), + normalize_path_for_compare(&expected) + ); + } + + #[test] + fn try_relocate_none_beyond_max_depth() { + // Default max depth is 3. Bury the repo deeper than that below the + // nearest existing ancestor so the scan cannot reach it. + let tmp = tempfile::tempdir().unwrap(); + let deep = tmp.path().join("oldbox").join("l1").join("l2").join("repo"); + std::fs::create_dir_all(&deep).unwrap(); + init_git_remote(&deep, "https://example.com/acme/deep.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(deep.clone()); + + // Rename the top box; nearest existing ancestor becomes tmp root, and + // the repo now sits 4 levels below it (box/l1/l2/repo) — out of reach. 
+ std::fs::rename(tmp.path().join("oldbox"), tmp.path().join("box")).unwrap(); + + assert!( + cfg.try_relocate(&alias).is_none(), + "repo beyond CODESEARCH_RELOCATE_MAX_DEPTH must not be relocated" + ); + } + + #[test] + fn relocate_missing_rewrites_only_moved_repos() { + let tmp = tempfile::tempdir().unwrap(); + let moved = tmp.path().join("moved"); + let stable = tmp.path().join("stable"); + std::fs::create_dir(&moved).unwrap(); + std::fs::create_dir(&stable).unwrap(); + init_git_remote(&moved, "https://example.com/acme/moved.git"); + init_git_remote(&stable, "https://example.com/acme/stable.git"); + + let mut cfg = ReposConfig::default(); + let moved_alias = cfg.register(moved.clone()); + let stable_alias = cfg.register(stable.clone()); + + let renamed = tmp.path().join("moved-renamed"); + std::fs::rename(&moved, &renamed).unwrap(); + + let (relocated, unresolved) = cfg.relocate_missing(); + assert!(unresolved.is_empty()); + assert_eq!(relocated.len(), 1); + assert_eq!(relocated[0].0, moved_alias); + assert_eq!( + normalize_path_for_compare(cfg.repos.get(&moved_alias).unwrap()), + normalize_path_for_compare(&renamed) + ); + // The stable repo is untouched. + assert_eq!( + normalize_path_for_compare(cfg.repos.get(&stable_alias).unwrap()), + normalize_path_for_compare(&stable) + ); + } + + #[test] + fn prune_stale_removes_unrelocatable_entries() { + let tmp = tempfile::tempdir().unwrap(); + // No git remote → cannot be relocated → must be pruned. + let plain = tmp.path().join("plain"); + std::fs::create_dir(&plain).unwrap(); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(plain.clone()); + cfg.add_group("g".to_string(), vec![alias.clone()]).unwrap(); + + std::fs::rename(&plain, tmp.path().join("plain-moved")).unwrap(); + + let (relocated, removed) = cfg.prune_stale(); + assert!(relocated.is_empty()); + assert_eq!(removed, vec![alias.clone()]); + assert!(!cfg.repos.contains_key(&alias)); + // unregister_alias also cleans group membership. 
+ assert!(!cfg.groups.contains_key("g")); + } + + #[test] + fn prune_stale_relocates_then_keeps_relocatable_entries() { + let tmp = tempfile::tempdir().unwrap(); + let repo = tmp.path().join("repo"); + std::fs::create_dir(&repo).unwrap(); + init_git_remote(&repo, "https://example.com/acme/keep.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(repo.clone()); + + let renamed = tmp.path().join("repo-renamed"); + std::fs::rename(&repo, &renamed).unwrap(); + + let (relocated, removed) = cfg.prune_stale(); + assert!(removed.is_empty()); + assert_eq!(relocated.len(), 1); + assert!(cfg.repos.contains_key(&alias), "relocated entry is kept"); + } + + #[test] + fn load_from_applies_reconcile_to_hand_edited_file() { + // A hand-edited repos.json with an empty-alias entry and a group that + // references an unknown alias must be reconciled (not crash) on load. + let tmp = tempfile::tempdir().unwrap(); + let cfg_path = tmp.path().join("repos.json"); + let json = r#"{ + "repos": { "": "/tmp/blank", "good": "/tmp/good" }, + "groups": { "mix": ["good", "ghost"], "dead": ["ghost"] }, + "repos_meta": { "ghost": {} } + }"#; + std::fs::write(&cfg_path, json).unwrap(); + + let cfg = ReposConfig::load_from(&cfg_path).expect("load should succeed"); + assert!(!cfg.repos.contains_key(""), "empty alias dropped"); + assert!(cfg.repos.contains_key("good")); + assert_eq!(cfg.groups.get("mix"), Some(&vec!["good".to_string()])); + assert!(!cfg.groups.contains_key("dead"), "empty group dropped"); + assert!(!cfg.repos_meta.contains_key("ghost"), "orphan meta dropped"); + } + #[test] fn try_relocate_finds_renamed_leaf() { let tmp = tempfile::tempdir().unwrap(); diff --git a/src/index/mod.rs b/src/index/mod.rs index cbecd00..798fb2c 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -1260,26 +1260,7 @@ pub async fn prune_index() -> Result<()> { use crate::db_discovery::repos::ReposConfig; let mut config = ReposConfig::load()?; - let aliases: Vec<String> = 
config.repos.keys().cloned().collect(); - - let mut relocated: Vec<(String, PathBuf)> = Vec::new(); - let mut removed: Vec<String> = Vec::new(); - - for alias in &aliases { - let Some(path) = config.resolve(alias) else { - continue; - }; - if path.exists() { - continue; - } - - if let Some(new_path) = config.try_relocate(alias) { - config.repos.insert(alias.clone(), new_path.clone()); - relocated.push((alias.clone(), new_path)); - } else if config.unregister_alias(alias) { - removed.push(alias.clone()); - } - } + let (relocated, removed) = config.prune_stale(); if relocated.is_empty() && removed.is_empty() { println!("✅ No stale repositories found — repos.json is clean."); diff --git a/src/serve/mod.rs b/src/serve/mod.rs index 452462d..d1bb59b 100644 --- a/src/serve/mod.rs +++ b/src/serve/mod.rs @@ -576,8 +576,6 @@ impl ServeState { return; } - let mut relocated = false; - let mut config = match self.config.write() { Ok(c) => c, Err(e) => { @@ -586,37 +584,20 @@ impl ServeState { } }; - for alias in &aliases { - let Some(path) = config.resolve(alias) else { - continue; - }; - if path.exists() { - continue; - } + let (relocated, unresolved) = config.relocate_missing(); - match config.try_relocate(alias) { - Some(new_path) => { - info!( - "reconcile: relocated '{}' → {} (was {})", - alias, - new_path.display(), - path.display() - ); - config.repos.insert(alias.clone(), new_path); - relocated = true; - } - None => { - warn!( - "reconcile: '{}' path missing ({}); skipping — \ - run `codesearch index prune` to remove it", - alias, - path.display() - ); - } - } + for (alias, new_path) in &relocated { + info!("reconcile: relocated '{}' → {}", alias, new_path.display()); + } + for alias in &unresolved { + warn!( + "reconcile: '{}' path missing; skipping — \ + run `codesearch index prune` to remove it", + alias + ); } - if relocated { + if !relocated.is_empty() { if let Err(e) = self.persist_config(&config) { warn!("reconcile: failed to persist relocated paths: {}", e); } From 
d2ce861d805dce58bc3030f4ba39610863625d22 Mon Sep 17 00:00:00 2001 From: flupkede Date: Mon, 1 Jun 2026 20:52:28 +0200 Subject: [PATCH 8/9] @ docs: README + CHANGELOG for relocation, index prune, derived alias - README: document `codesearch index prune`, automatic relocation of moved/renamed repos (CODESEARCH_RELOCATE_MAX_DEPTH), the alias-always- derived policy (no --alias flag), and hand-edited repos.json tolerance. - CHANGELOG: consolidated 1.0.149 entry (Added/Changed/Fixed). - README language table + alias example updates (pre-existing). Co-Authored-By: Claude Opus 4.8 (1M context) @ --- CHANGELOG.md | 34 ++++++++++++++++++++++++++++++++++ Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 42 ++++++++++++++++++++++++++++++++++++------ 4 files changed, 72 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c4a3d2..5566622 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +## [1.0.149] - 2026-06-01 + +### Added + +- **Best-effort relocation of moved/renamed repositories** — every repo's git + remote (`remote.origin.url`) is now captured at registration. When a + registered folder is renamed or moved, `codesearch serve` no longer crashes: + on startup it reconciles all paths, and for each missing path it scans nearby + folders (bounded depth, override with `CODESEARCH_RELOCATE_MAX_DEPTH`, default + `3`) for a git checkout with the same remote. A single unambiguous match is + rewritten into `repos.json`; ambiguous/absent matches are logged and skipped + (the dead path is never indexed). Phase-2 (C# SCIP) and Phase-3 (pre-warm) + also guard `path.exists()` so a stale path can never reach heavy code paths. +- **`codesearch index prune`** — new command that relocates moved repos first, + then unregisters any remaining stale entries, printing a summary. 
+ +### Changed + +- **The user-settable `--alias`/`-a` flag was removed from `index add`** — the + alias (the `repos.json` key, used by groups and the MCP `project` argument) is + now always derived from the repository directory name. In practice the alias + always had to equal the directory name, so a custom alias only caused + downstream mismatches. The `index symbol ` positional (a lookup key) is + unchanged. + +### Fixed + +- **A hand-edited or corrupt-ish `repos.json` no longer crashes the app** — on + load the config is reconciled in memory: entries with empty/blank alias keys + are dropped, orphaned `repos_meta` is removed, and group members referencing + unknown aliases (and groups left empty) are pruned. Valid aliases are never + renamed (that would break group references). + + ## [1.0.142] - 2026-06-01 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index cb147e2..dd2828f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.149" +version = "1.0.150" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index d8eb68a..76d7c53 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.149" +version = "1.0.150" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/README.md b/README.md index e4afec5..08a2f07 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ codesearch gives AI agents (OpenCode, Claude Code, Cursor, and any MCP client) d - **Multi-repo serve mode**: Fan-out queries across repository groups with cross-repo RRF ranking - **Hybrid retrieval**: Vector embeddings + BM25 full-text search fused with Reciprocal Rank Fusion - **Symbol navigation**: Jump to definitions, find usages, trace imports and dependents — in the same tool -- **AST-aware chunking**: Tree-sitter parsing for 9 languages — chunks align to functions/classes, not arbitrary line ranges +- **AST-aware 
chunking**: Tree-sitter parsing for 14 languages — chunks align to functions/classes, not arbitrary line ranges - **Token-efficient**: Returns metadata by default; agents fetch full code only when needed via `get_chunk` - **Lightweight footprint**: Hundreds of MB on disk, runs on CPU only, no runtime model downloads (works behind enterprise proxies) - **Zero config for single repos**: `codesearch index && codesearch mcp` — done @@ -118,6 +118,9 @@ codesearch index rm /path/to/my-project # List registered repos codesearch index list + +# Remove stale entries (relocates moved repos first, then drops the rest) +codesearch index prune ``` `codesearch index add` is intended to be run from inside the repo you want to register. @@ -312,17 +315,39 @@ Repos are registered via `codesearch index add`: ```bash # Register a repo (creates index + adds to ~/.codesearch/repos.json) -codesearch index add /path/to/my-project --alias my-project +codesearch index add /path/to/my-project # Remove a repo codesearch index rm /path/to/my-project # List registered repos codesearch index list + +# Clean up stale entries (relocates moved repos, drops the rest) +codesearch index prune ``` +The repository **alias** (the key in `repos.json`, used for groups and the MCP +`project` argument) is always derived automatically from the directory name — +there is no `--alias` flag. + Serve reads `~/.codesearch/repos.json` on startup and manages all registered repos. +#### Moved or renamed repositories + +If you rename or move a registered folder, serve does **not** crash. On startup +it tries to **relocate** each missing repo automatically: it captures every +repo's git remote (`remote.origin.url`) at registration, and on a missing path +it scans nearby folders (bounded depth, override with +`CODESEARCH_RELOCATE_MAX_DEPTH`, default `3`) for a git checkout with the same +remote. 
A single unambiguous match is rewritten into `repos.json`; otherwise the +entry is logged and skipped (never indexed against a dead path). Run +`codesearch index prune` to relocate what can be relocated and drop the rest. + +A hand-edited `repos.json` is also tolerated: empty entries, orphaned metadata, +and group references to unknown repos are cleaned up on load rather than +crashing. + ### Groups Groups let you search across related repositories: @@ -410,14 +435,19 @@ Tree-sitter AST-aware chunking: | Language | Extensions | |----------|-----------| | Rust | `.rs` | -| Python | `.py` | -| JavaScript | `.js`, `.jsx` | -| TypeScript | `.ts`, `.tsx` | +| Python | `.py`, `.pyw`, `.pyi` | +| JavaScript | `.js`, `.mjs`, `.cjs` | +| TypeScript | `.ts`, `.tsx`, `.jsx`, `.mts`, `.cts` | | C | `.c`, `.h` | -| C++ | `.cpp`, `.hpp` | +| C++ | `.cpp`, `.cc`, `.cxx`, `.hpp`, `.hxx` | | C# | `.cs` | | Go | `.go` | | Java | `.java` | +| Shell/Bash | `.sh`, `.bash`, `.zsh` | +| Ruby | `.rb`, `.rake` | +| PHP | `.php` | +| YAML | `.yaml`, `.yml` | +| JSON | `.json` | All other text files use line-based chunking as fallback. From 05e0b4778ed79c33dd2ef6d18fa7d2b1d49d3800 Mon Sep 17 00:00:00 2001 From: flupkede Date: Mon, 1 Jun 2026 21:03:25 +0200 Subject: [PATCH 9/9] @ [worker] address review remarks: align CHANGELOG version + restore log path - CHANGELOG entry retitled to 1.0.151 to match the shipped Cargo.toml version (pre-commit bumps patch by 1 on this commit). - reconcile warn for unresolved repos again includes the missing path for diagnostics (lost during the relocate_missing extraction). 
Co-Authored-By: Claude Opus 4.8 (1M context) @ --- CHANGELOG.md | 2 +- Cargo.lock | 2 +- Cargo.toml | 2 +- src/serve/mod.rs | 9 +++++++-- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5566622..933af51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 -## [1.0.149] - 2026-06-01 +## [1.0.151] - 2026-06-01 ### Added diff --git a/Cargo.lock b/Cargo.lock index dd2828f..ddd4acc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.150" +version = "1.0.151" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index 76d7c53..0104aa6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.150" +version = "1.0.151" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/serve/mod.rs b/src/serve/mod.rs index d1bb59b..eb0fac8 100644 --- a/src/serve/mod.rs +++ b/src/serve/mod.rs @@ -590,10 +590,15 @@ impl ServeState { info!("reconcile: relocated '{}' → {}", alias, new_path.display()); } for alias in &unresolved { + let missing = config + .repos + .get(alias) + .map(|p| p.display().to_string()) + .unwrap_or_default(); warn!( - "reconcile: '{}' path missing; skipping — \ + "reconcile: '{}' path missing ({}); skipping — \ run `codesearch index prune` to remove it", - alias + alias, missing ); }