diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 06c9ec9..9ec7462 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -22,6 +22,7 @@ Add symbol-aware reference lookups to codesearch via `find_impact` MCP tool. Ret - **`-with-csharp` release variants** — 6 release archives (3 plain + 3 with helper) - **Gated integration test** — `csharp_helper_integration` cargo feature for full-pipeline testing - **CI** — separate `csharp-integration-tests` job in `.github/workflows/ci.yml` +- **Stale-path resilience + derived alias** — moved/renamed indexed folders no longer crash serve: `git_remote` captured at registration, `reconcile_all_paths()` best-effort relocates by matching `remote.origin.url` (bounded depth, `CODESEARCH_RELOCATE_MAX_DEPTH`, default 3) else warn+skip; `codesearch index prune` for manual cleanup. The `--alias` flag was removed (alias always = directory name). `ReposConfig::reconcile()` hardens hand-edited `repos.json` on load. See AGENTS.md for details. ## Architecture diff --git a/.claude/commands/merge.md b/.claude/commands/merge.md index 3931644..0898a0d 100644 --- a/.claude/commands/merge.md +++ b/.claude/commands/merge.md @@ -80,9 +80,11 @@ release — tagging happens only in `/release`. `PR=$(gh pr view --json number --jq .number)`. 8. **Auto-merge after CI** - - `gh pr merge "$PR" --auto --merge` so the PR lands automatically once required checks pass. + - This repo **disallows merge commits** — always use `--squash`. NEVER `--merge` + (it fails with "Merge commits are not allowed on this repository"). + - `gh pr merge "$PR" --auto --squash` so the PR lands automatically once required checks pass. - If auto-merge is not enabled on the repo (command errors), fall back: poll - `gh pr checks "$PR" --watch`, then `gh pr merge "$PR" --merge` once green. + `gh pr checks "$PR" --watch`, then `gh pr merge "$PR" --squash` once green. 
## Report Branch, pending release version, doc updates made, PR URL, and merge status diff --git a/.claude/commands/release.md b/.claude/commands/release.md index 16f2641..df062ed 100644 --- a/.claude/commands/release.md +++ b/.claude/commands/release.md @@ -40,9 +40,10 @@ to `develop`). Then **wait for the develop PR to actually merge** (auto-merge wa matching history (e.g. `Release v1.0.142 — serve responsive during warmup`). - Body ends with: `🤖 Generated with [Claude Code](https://claude.com/claude-code)`. - Capture the PR number: `RELEASE_PR=$(gh pr view develop --json number --jq .number)`. -4. `gh pr merge "$RELEASE_PR" --auto --merge`. Wait until `state` is +4. This repo **disallows merge commits** — always use `--squash`, never `--merge`. + `gh pr merge "$RELEASE_PR" --auto --squash`. Wait until `state` is `MERGED` (poll as in Part 1). If auto-merge is unavailable, `gh pr checks "$RELEASE_PR" --watch` - then `gh pr merge "$RELEASE_PR" --merge`. If CI fails, STOP. + then `gh pr merge "$RELEASE_PR" --squash`. If CI fails, STOP. ## Part 3 — tag the release 1. `git fetch origin --tags && git checkout master && git pull --ff-only origin master`. diff --git a/AGENTS.md b/AGENTS.md index f239200..fa12e85 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -26,7 +26,10 @@ Add symbol-aware reference lookups to codesearch via `find_impact` MCP tool. 
Ret - **Gated integration test** — `csharp_helper_integration` cargo feature for full-pipeline testing - **CI** — separate `csharp-integration-tests` job in `.github/workflows/ci.yml` - **Sequential phase-2 startup** — Phase 1 warms repos sequentially, Phase 2 runs gated C# SCIP rebuilds ordered by `last_changed_unix` under `Semaphore(concurrency)` via `CSHARP_SCIP_CONCURRENCY` env (default **2**, clamp [1,4]) -- **`repos_meta` tracking** — `RepoMeta` (last_changed_unix, last_scip_indexed_unix) persisted in `repos.json` with debounced save (10s window) +- **`repos_meta` tracking** — `RepoMeta` (last_changed_unix, last_scip_indexed_unix, git_remote) persisted in `repos.json` with debounced save (10s window) +- **Stale-path resilience** — a renamed/moved indexed folder no longer crashes serve. `git_remote` (`remote.origin.url`) is captured at registration; on startup `ServeState::reconcile_all_paths()` best-effort relocates a missing repo by scanning the nearest existing ancestor (bounded depth, env `CODESEARCH_RELOCATE_MAX_DEPTH`, default 3) for a git root with a matching remote — exactly one match rewrites `repos.json`, otherwise warn + skip. Phase-2/Phase-3 also guard `path.exists()`. Manual cleanup via **`codesearch index prune`** (relocate-first, else unregister stale aliases) +- **Alias is always derived** — the user-settable `--alias`/`-a` flag was removed from `index add`; the alias always equals the (sanitized) directory name via `ReposConfig::register()`. The alias remains the internal identifier (repos.json key, groups, `project` arg); only user override is gone. The `index symbol <alias>` positional is a lookup key and is retained +- **Hand-edited `repos.json` tolerated** — `ReposConfig::reconcile()` runs in-memory on every load: drops empty-alias entries, drops orphan `repos_meta`, prunes group members referencing unknown aliases and empty groups. 
Never renames valid aliases, never crashes - **TUI C# indicator** — in status column: green `C#·` ready, yellow `C#…` indexing, red `C#!` error; footer shows helper availability; Calls column with tool call count - **Phase 2 & 3 TUI feedback** — Phase 2 pre-marks all queued candidates as `C#…` immediately on discovery (before semaphore slot); Phase 3 pre-warm sets `csharp_index_status = Indexing` before `batch-find-refs` and restores `Ready` after — TUI shows `C#…` throughout without touching `active_reindexes` (avoids blocking HTTP /reindex) - **Selective ref cache invalidation** — incremental rebuilds only purge cached refs for affected symbols, not entire cache diff --git a/CHANGELOG.md b/CHANGELOG.md index acb6322..756caa3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,39 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +## [1.0.152] - 2026-06-02 + +### Added + +- **Best-effort relocation of moved/renamed repositories** — every repo's git + remote (`remote.origin.url`) is now captured at registration. When a + registered folder is renamed or moved, `codesearch serve` no longer crashes: + on startup it reconciles all paths, and for each missing path it scans nearby + folders (bounded depth, override with `CODESEARCH_RELOCATE_MAX_DEPTH`, default + `3`) for a git checkout with the same remote. A single unambiguous match is + rewritten into `repos.json`; ambiguous/absent matches are logged and skipped + (the dead path is never indexed). Phase-2 (C# SCIP) and Phase-3 (pre-warm) + also guard `path.exists()` so a stale path can never reach heavy code paths. +- **`codesearch index prune`** — new command that relocates moved repos first, + then unregisters any remaining stale entries, printing a summary. 
+ +### Changed + +- **The user-settable `--alias`/`-a` flag was removed from `index add`** — the + alias (the `repos.json` key, used by groups and the MCP `project` argument) is + now always derived from the repository directory name. In practice the alias + always had to equal the directory name, so a custom alias only caused + downstream mismatches. The `index symbol <alias>` positional (a lookup key) is + unchanged. + +### Fixed + +- **A hand-edited or corrupt-ish `repos.json` no longer crashes the app** — on + load the config is reconciled in memory: entries with empty/blank alias keys + are dropped, orphaned `repos_meta` is removed, and group members referencing + unknown aliases (and groups left empty) are pruned. Valid aliases are never + renamed (that would break group references). + ## [1.0.146] - 2026-06-02 ### Added diff --git a/Cargo.lock b/Cargo.lock index c22ffaa..5d32baf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.146" +version = "1.0.152" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index 004caa1..33d2449 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.146" +version = "1.0.152" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/README.md b/README.md index 8ee61fe..a374aab 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,9 @@ codesearch index rm /path/to/my-project # List registered repos codesearch index list + +# Remove stale entries (relocates moved repos first, then drops the rest) +codesearch index prune ``` `codesearch index add` is intended to be run from inside the repo you want to register. 
@@ -312,17 +315,39 @@ Repos are registered via `codesearch index add`: ```bash # Register a repo (creates index + adds to ~/.codesearch/repos.json) -codesearch index add /path/to/my-project --alias my-project +codesearch index add /path/to/my-project # Remove a repo codesearch index rm /path/to/my-project # List registered repos codesearch index list + +# Clean up stale entries (relocates moved repos, drops the rest) +codesearch index prune ``` +The repository **alias** (the key in `repos.json`, used for groups and the MCP +`project` argument) is always derived automatically from the directory name — +there is no `--alias` flag. + Serve reads `~/.codesearch/repos.json` on startup and manages all registered repos. +#### Moved or renamed repositories + +If you rename or move a registered folder, serve does **not** crash. On startup +it tries to **relocate** each missing repo automatically: it captures every +repo's git remote (`remote.origin.url`) at registration, and on a missing path +it scans nearby folders (bounded depth, override with +`CODESEARCH_RELOCATE_MAX_DEPTH`, default `3`) for a git checkout with the same +remote. A single unambiguous match is rewritten into `repos.json`; otherwise the +entry is logged and skipped (never indexed against a dead path). Run +`codesearch index prune` to relocate what can be relocated and drop the rest. + +A hand-edited `repos.json` is also tolerated: empty entries, orphaned metadata, +and group references to unknown repos are cleaned up on load rather than +crashing. 
+ ### Groups Groups let you search across related repositories: diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 813dd0d..4c865dc 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -20,10 +20,6 @@ pub enum IndexCommands { /// Create global index instead of local #[arg(short = 'g', long)] global: bool, - - /// Alias for this repository (auto-generated from directory name if omitted) - #[arg(short, long)] - alias: Option, }, /// Remove the index (local or global, auto-detected) @@ -49,6 +45,9 @@ pub enum IndexCommands { #[arg(short = 'f', long)] force: bool, }, + + /// Remove stale entries from repos.json (relocates moved repos first) + Prune, } /// Cache subcommands @@ -235,10 +234,6 @@ pub enum Commands { #[arg(short = 'g', long)] global: bool, - /// Alias for this repository (only with --add) - #[arg(short, long)] - alias: Option, - /// Remove the index (local or global, auto-detected) #[arg(long, visible_alias = "rm")] remove: bool, @@ -532,7 +527,6 @@ pub async fn run(cancel_token: CancellationToken) -> Result<()> { symbols, add, global, - alias, remove, keep_config, list, @@ -543,11 +537,7 @@ pub async fn run(cancel_token: CancellationToken) -> Result<()> { IndexCommands::Add { path: add_path, global, - alias, - } => { - crate::index::add_to_index(add_path, global, alias, cancel_token.clone()) - .await - } + } => crate::index::add_to_index(add_path, global, cancel_token.clone()).await, IndexCommands::Remove { path: rm_path, keep_config, @@ -556,6 +546,7 @@ pub async fn run(cancel_token: CancellationToken) -> Result<()> { IndexCommands::Symbol { alias, force } => { trigger_symbol_reindex_via_api(&alias, force).await } + IndexCommands::Prune => crate::index::prune_index().await, } } else { // Flag-based backward-compat path @@ -569,8 +560,7 @@ pub async fn run(cancel_token: CancellationToken) -> Result<()> { if add || is_add_cmd { let effective_path = if is_add_cmd { None } else { path }; - crate::index::add_to_index(effective_path, global, alias, 
cancel_token.clone()) - .await + crate::index::add_to_index(effective_path, global, cancel_token.clone()).await } else if remove || is_rm_cmd { let effective_path = if is_rm_cmd { None } else { path }; crate::index::remove_from_index(effective_path, keep_config).await @@ -911,22 +901,32 @@ mod tests { } #[test] - fn test_cli_index_add_accepts_alias_flag() { - let cli = Cli::try_parse_from([ + fn test_cli_index_add_rejects_alias_flag() { + // The user-settable alias was removed; the flag must no longer parse. + let result = Cli::try_parse_from([ "codesearch", "index", "add", "/tmp/foo", "--alias", "myrepo", - ]) - .expect("cli parse should succeed"); + ]); + assert!( + result.is_err(), + "'--alias' flag should no longer be accepted on `index add`" + ); + } + + #[test] + fn test_cli_index_add_parses_without_alias() { + let cli = Cli::try_parse_from(["codesearch", "index", "add", "/tmp/foo"]) + .expect("cli parse should succeed"); match cli.command { Commands::Index { - command: Some(IndexCommands::Add { alias: Some(a), .. }), + command: Some(IndexCommands::Add { path: Some(p), .. }), .. - } => assert_eq!(a, "myrepo"), - _ => panic!("expected Index::Add subcommand with alias"), + } => assert_eq!(p, std::path::PathBuf::from("/tmp/foo")), + _ => panic!("expected Index::Add subcommand"), } } diff --git a/src/constants.rs b/src/constants.rs index 0b03770..ba26ee5 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -192,6 +192,13 @@ pub const DEFAULT_EMBEDDING_DIMENSIONS: usize = 384; /// Environment variable to override repos config file path. pub const REPOS_CONFIG_ENV: &str = "CODESEARCH_REPOS_CONFIG"; +/// Environment variable to override how deep relocation scans for a moved repo. +pub const RELOCATE_MAX_DEPTH_ENV: &str = "CODESEARCH_RELOCATE_MAX_DEPTH"; + +/// Default bounded depth for the relocation scan (directories below the nearest +/// existing ancestor of a stale repo path). 
+pub const DEFAULT_RELOCATE_MAX_DEPTH: usize = 3; + /// Environment variable to set MCP mode: "auto", "client", or "local". pub const MCP_MODE_ENV: &str = "CODESEARCH_MCP_MODE"; diff --git a/src/db_discovery/repos.rs b/src/db_discovery/repos.rs index dbdd1eb..bca28e9 100644 --- a/src/db_discovery/repos.rs +++ b/src/db_discovery/repos.rs @@ -24,6 +24,10 @@ pub struct RepoMeta { /// Unix timestamp (seconds) of last successful SCIP index rebuild. #[serde(default, skip_serializing_if = "Option::is_none")] pub last_scip_indexed_unix: Option, + /// Git remote URL (`remote.origin.url`) captured at registration time. + /// Used to re-locate a repo whose folder was renamed/moved (best-effort). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub git_remote: Option, } #[derive(Debug, Deserialize)] @@ -47,7 +51,8 @@ impl ReposConfig { let content = fs::read_to_string(path)?; // New format - if let Ok(config) = serde_json::from_str::(&content) { + if let Ok(mut config) = serde_json::from_str::(&content) { + config.reconcile(); return Ok(config); } @@ -60,11 +65,13 @@ impl ReposConfig { repos.insert(alias, path); } - return Ok(Self { + let mut config = Self { repos, groups: HashMap::new(), repos_meta: HashMap::new(), - }); + }; + config.reconcile(); + return Ok(config); } // Both parses failed — file is corrupt @@ -74,6 +81,65 @@ impl ReposConfig { )) } + /// Harden an in-memory config loaded from disk so a hand-edited + /// `repos.json` can never crash the app. This is best-effort cleanup, + /// performed in memory only (no disk write here): + /// + /// 1. Drop repo entries whose alias key is empty/blank. + /// 2. Drop `repos_meta` entries that reference an unknown alias. + /// 3. Prune group members that reference unknown aliases; drop now-empty + /// groups. + /// + /// Existing (non-empty) alias keys are never renamed — that would break + /// group references — so a merely "non-standard" hand-edited alias is + /// tolerated as-is. 
+ pub(crate) fn reconcile(&mut self) { + // 1. Drop empty/blank alias keys. + let empty_keys: Vec = self + .repos + .keys() + .filter(|alias| alias.trim().is_empty()) + .cloned() + .collect(); + for alias in empty_keys { + tracing::warn!("repos.json: dropping entry with empty alias key"); + self.repos.remove(&alias); + } + + // 2. Drop meta entries pointing at unknown aliases. + let orphan_meta: Vec = self + .repos_meta + .keys() + .filter(|alias| !self.repos.contains_key(*alias)) + .cloned() + .collect(); + for alias in orphan_meta { + tracing::warn!("repos.json: dropping orphan metadata for '{}'", alias); + self.repos_meta.remove(&alias); + } + + // 3. Prune group members referencing unknown aliases; drop empty groups. + let mut empty_groups: Vec = Vec::new(); + for (group, members) in self.groups.iter_mut() { + let before = members.len(); + members.retain(|alias| self.repos.contains_key(alias)); + if members.len() != before { + tracing::warn!( + "repos.json: pruned {} unknown alias(es) from group '{}'", + before - members.len(), + group + ); + } + if members.is_empty() { + empty_groups.push(group.clone()); + } + } + for group in empty_groups { + tracing::warn!("repos.json: dropping now-empty group '{}'", group); + self.groups.remove(&group); + } + } + pub fn save(&self) -> Result<()> { let path = Self::path()?; self.save_to(&path) @@ -108,6 +174,9 @@ impl ReposConfig { } let alias = unique_alias_for_path(&self.repos, &canonical); + if let Some(remote) = git_remote_url(&canonical) { + self.repos_meta.entry(alias.clone()).or_default().git_remote = Some(remote); + } self.repos.insert(alias.clone(), canonical); alias } @@ -137,6 +206,12 @@ impl ReposConfig { None => unique_alias_for_path(&self.repos, &canonical), }; + if let Some(remote) = git_remote_url(&canonical) { + self.repos_meta + .entry(final_alias.clone()) + .or_default() + .git_remote = Some(remote); + } self.repos.insert(final_alias.clone(), canonical); Ok(final_alias) } @@ -279,6 +354,95 @@ impl 
ReposConfig { .find(|(_, p)| normalize_path_for_compare(p) == normalize_path_for_compare(&canonical)) .map(|(alias, _)| alias.clone()) } + + /// Best-effort relocation of a registered repo whose stored path no longer + /// exists (e.g. its folder was renamed/moved). Starting from the nearest + /// still-existing ancestor of the stale path, scans (bounded depth) for a + /// git repository whose `remote.origin.url` matches the one captured at + /// registration time. Returns the new path only on a single unambiguous + /// match; `None` when the path still exists, no remote was recorded, or the + /// match is absent/ambiguous. + pub fn try_relocate(&self, alias: &str) -> Option { + let stale = self.repos.get(alias)?; + if stale.exists() { + return None; // path is fine — nothing to relocate + } + + let target_remote = self.repos_meta.get(alias)?.git_remote.clone()?; + + // Walk up to the nearest ancestor that still exists on disk. + let mut anchor = stale.parent(); + while let Some(dir) = anchor { + if dir.exists() { + break; + } + anchor = dir.parent(); + } + let anchor = anchor?; + + let mut matches = Vec::new(); + scan_for_remote(anchor, &target_remote, relocate_max_depth(), &mut matches); + + // Don't relocate onto a path already registered under another alias. + matches.retain(|p| { + !self.repos.iter().any(|(a, existing)| { + a != alias && normalize_path_for_compare(existing) == normalize_path_for_compare(p) + }) + }); + + if matches.len() == 1 { + Some(strip_unc_prefix(matches.into_iter().next().unwrap())) + } else { + None + } + } + + /// Relocate every registered repo whose stored path no longer exists. + /// + /// For each missing path a best-effort git-identity relocation is attempted + /// ([`Self::try_relocate`]); successful matches rewrite the in-memory + /// `repos` map. This is pure (no disk I/O, no logging) so callers can decide + /// how to report and persist. 
Returns `(relocated, unresolved)` where + /// `relocated` is the list of `(alias, new_path)` rewrites and `unresolved` + /// is the list of aliases whose path is still missing. + #[must_use] + pub fn relocate_missing(&mut self) -> (Vec<(String, PathBuf)>, Vec) { + let aliases: Vec = self.repos.keys().cloned().collect(); + let mut relocated = Vec::new(); + let mut unresolved = Vec::new(); + + for alias in aliases { + let Some(path) = self.repos.get(&alias) else { + continue; + }; + if path.exists() { + continue; + } + match self.try_relocate(&alias) { + Some(new_path) => { + self.repos.insert(alias.clone(), new_path.clone()); + relocated.push((alias, new_path)); + } + None => unresolved.push(alias), + } + } + + (relocated, unresolved) + } + + /// Prune stale entries: relocate what can be relocated, then unregister the + /// rest. Pure (no disk I/O, no logging). Returns `(relocated, removed)`. + #[must_use] + pub fn prune_stale(&mut self) -> (Vec<(String, PathBuf)>, Vec) { + let (relocated, unresolved) = self.relocate_missing(); + let mut removed = Vec::new(); + for alias in unresolved { + if self.unregister_alias(&alias) { + removed.push(alias); + } + } + (relocated, removed) + } } pub fn config_dir() -> Result { @@ -354,11 +518,375 @@ fn normalize_path_for_compare(path: &Path) -> String { crate::cache::normalize_path(path) } +/// Best-effort lookup of a directory's git remote URL (`remote.origin.url`). +/// +/// Returns `None` when `git` is unavailable, the path is not a git repo, or the +/// repo has no `origin` remote. Used both to capture a repo's identity at +/// registration time and to match candidate directories during relocation. 
+pub(crate) fn git_remote_url(path: &Path) -> Option { + let output = std::process::Command::new("git") + .arg("-C") + .arg(path) + .args(["config", "--get", "remote.origin.url"]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let url = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if url.is_empty() { + None + } else { + Some(url) + } +} + +/// Configured relocation scan depth (`CODESEARCH_RELOCATE_MAX_DEPTH`, default 3). +fn relocate_max_depth() -> usize { + std::env::var(crate::constants::RELOCATE_MAX_DEPTH_ENV) + .ok() + .and_then(|v| v.trim().parse::().ok()) + .unwrap_or(crate::constants::DEFAULT_RELOCATE_MAX_DEPTH) +} + +/// Directory names never worth descending into during a relocation scan. +fn is_skippable_scan_dir(path: &Path) -> bool { + let Some(name) = path.file_name().and_then(|n| n.to_str()) else { + return false; + }; + name == crate::constants::DB_DIR_NAME + || matches!( + name, + ".git" | "node_modules" | "target" | "bin" | "obj" | "dist" | "build" + ) +} + +/// Recursively collect git roots under `dir` (bounded by `depth`) whose +/// `remote.origin.url` matches `target_remote`. A matching git root is recorded +/// and not descended into (nested repos below it are ignored). +fn scan_for_remote(dir: &Path, target_remote: &str, depth: usize, out: &mut Vec) { + if dir.join(".git").exists() { + if git_remote_url(dir).as_deref() == Some(target_remote) { + out.push(dir.to_path_buf()); + } + return; + } + + if depth == 0 { + return; + } + + if let Ok(entries) = std::fs::read_dir(dir) { + for entry in entries.flatten() { + let child = entry.path(); + if child.is_dir() && !is_skippable_scan_dir(&child) { + scan_for_remote(&child, target_remote, depth - 1, out); + } + } + } +} + #[cfg(test)] mod tests { use super::*; use std::io::Write; + /// Initialise a git repo at `dir` with an `origin` remote pointing at `url`. 
+ fn init_git_remote(dir: &Path, url: &str) { + let run = |args: &[&str]| { + std::process::Command::new("git") + .arg("-C") + .arg(dir) + .args(args) + .output() + .expect("git available in test env") + }; + run(&["init"]); + run(&["remote", "add", "origin", url]); + } + + #[test] + fn captures_git_remote_on_register() { + let tmp = tempfile::tempdir().unwrap(); + let repo = tmp.path().join("repo"); + std::fs::create_dir(&repo).unwrap(); + init_git_remote(&repo, "https://example.com/acme/repo.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(repo); + assert_eq!( + cfg.meta(&alias).git_remote.as_deref(), + Some("https://example.com/acme/repo.git") + ); + } + + #[test] + fn register_derives_alias_from_directory_name() { + let tmp = tempfile::tempdir().unwrap(); + let repo = tmp.path().join("My.Cool-Repo"); + std::fs::create_dir(&repo).unwrap(); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(repo.clone()); + // Alias is derived from (and sanitized from) the directory name. + assert_eq!(alias, sanitize_alias("My.Cool-Repo")); + assert!(cfg.repos.contains_key(&alias)); + } + + #[test] + fn try_relocate_finds_renamed_parent() { + let tmp = tempfile::tempdir().unwrap(); + let parent = tmp.path().join("parent"); + let repo = parent.join("repo"); + std::fs::create_dir_all(&repo).unwrap(); + init_git_remote(&repo, "https://example.com/acme/parent-repo.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(repo.clone()); + + // Rename the PARENT folder; the stored repo path is now stale, but the + // repo itself sits one level below the nearest existing ancestor (tmp). 
+ std::fs::rename(&parent, tmp.path().join("parent-renamed")).unwrap(); + + let expected = tmp.path().join("parent-renamed").join("repo"); + let found = cfg + .try_relocate(&alias) + .expect("should relocate via renamed parent"); + assert_eq!( + normalize_path_for_compare(&found), + normalize_path_for_compare(&expected) + ); + } + + #[test] + fn try_relocate_none_beyond_max_depth() { + // Default max depth is 3. Bury the repo deeper than that below the + // nearest existing ancestor so the scan cannot reach it. + let tmp = tempfile::tempdir().unwrap(); + let deep = tmp.path().join("oldbox").join("l1").join("l2").join("repo"); + std::fs::create_dir_all(&deep).unwrap(); + init_git_remote(&deep, "https://example.com/acme/deep.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(deep.clone()); + + // Rename the top box; nearest existing ancestor becomes tmp root, and + // the repo now sits 4 levels below it (box/l1/l2/repo) — out of reach. + std::fs::rename(tmp.path().join("oldbox"), tmp.path().join("box")).unwrap(); + + assert!( + cfg.try_relocate(&alias).is_none(), + "repo beyond CODESEARCH_RELOCATE_MAX_DEPTH must not be relocated" + ); + } + + #[test] + fn relocate_missing_rewrites_only_moved_repos() { + let tmp = tempfile::tempdir().unwrap(); + let moved = tmp.path().join("moved"); + let stable = tmp.path().join("stable"); + std::fs::create_dir(&moved).unwrap(); + std::fs::create_dir(&stable).unwrap(); + init_git_remote(&moved, "https://example.com/acme/moved.git"); + init_git_remote(&stable, "https://example.com/acme/stable.git"); + + let mut cfg = ReposConfig::default(); + let moved_alias = cfg.register(moved.clone()); + let stable_alias = cfg.register(stable.clone()); + + let renamed = tmp.path().join("moved-renamed"); + std::fs::rename(&moved, &renamed).unwrap(); + + let (relocated, unresolved) = cfg.relocate_missing(); + assert!(unresolved.is_empty()); + assert_eq!(relocated.len(), 1); + assert_eq!(relocated[0].0, moved_alias); + 
assert_eq!( + normalize_path_for_compare(cfg.repos.get(&moved_alias).unwrap()), + normalize_path_for_compare(&renamed) + ); + // The stable repo is untouched. + assert_eq!( + normalize_path_for_compare(cfg.repos.get(&stable_alias).unwrap()), + normalize_path_for_compare(&stable) + ); + } + + #[test] + fn prune_stale_removes_unrelocatable_entries() { + let tmp = tempfile::tempdir().unwrap(); + // No git remote → cannot be relocated → must be pruned. + let plain = tmp.path().join("plain"); + std::fs::create_dir(&plain).unwrap(); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(plain.clone()); + cfg.add_group("g".to_string(), vec![alias.clone()]).unwrap(); + + std::fs::rename(&plain, tmp.path().join("plain-moved")).unwrap(); + + let (relocated, removed) = cfg.prune_stale(); + assert!(relocated.is_empty()); + assert_eq!(removed, vec![alias.clone()]); + assert!(!cfg.repos.contains_key(&alias)); + // unregister_alias also cleans group membership. + assert!(!cfg.groups.contains_key("g")); + } + + #[test] + fn prune_stale_relocates_then_keeps_relocatable_entries() { + let tmp = tempfile::tempdir().unwrap(); + let repo = tmp.path().join("repo"); + std::fs::create_dir(&repo).unwrap(); + init_git_remote(&repo, "https://example.com/acme/keep.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(repo.clone()); + + let renamed = tmp.path().join("repo-renamed"); + std::fs::rename(&repo, &renamed).unwrap(); + + let (relocated, removed) = cfg.prune_stale(); + assert!(removed.is_empty()); + assert_eq!(relocated.len(), 1); + assert!(cfg.repos.contains_key(&alias), "relocated entry is kept"); + } + + #[test] + fn load_from_applies_reconcile_to_hand_edited_file() { + // A hand-edited repos.json with an empty-alias entry and a group that + // references an unknown alias must be reconciled (not crash) on load. 
+ let tmp = tempfile::tempdir().unwrap(); + let cfg_path = tmp.path().join("repos.json"); + let json = r#"{ + "repos": { "": "/tmp/blank", "good": "/tmp/good" }, + "groups": { "mix": ["good", "ghost"], "dead": ["ghost"] }, + "repos_meta": { "ghost": {} } + }"#; + std::fs::write(&cfg_path, json).unwrap(); + + let cfg = ReposConfig::load_from(&cfg_path).expect("load should succeed"); + assert!(!cfg.repos.contains_key(""), "empty alias dropped"); + assert!(cfg.repos.contains_key("good")); + assert_eq!(cfg.groups.get("mix"), Some(&vec!["good".to_string()])); + assert!(!cfg.groups.contains_key("dead"), "empty group dropped"); + assert!(!cfg.repos_meta.contains_key("ghost"), "orphan meta dropped"); + } + + #[test] + fn try_relocate_finds_renamed_leaf() { + let tmp = tempfile::tempdir().unwrap(); + let original = tmp.path().join("myrepo"); + std::fs::create_dir(&original).unwrap(); + init_git_remote(&original, "https://example.com/acme/myrepo.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(original.clone()); + + // Rename the leaf folder; stored path is now stale. 
+ let renamed = tmp.path().join("myrepo-renamed"); + std::fs::rename(&original, &renamed).unwrap(); + + let found = cfg + .try_relocate(&alias) + .expect("should relocate renamed leaf"); + assert_eq!( + normalize_path_for_compare(&found), + normalize_path_for_compare(&renamed) + ); + } + + #[test] + fn try_relocate_returns_none_when_path_exists() { + let tmp = tempfile::tempdir().unwrap(); + let repo = tmp.path().join("live"); + std::fs::create_dir(&repo).unwrap(); + init_git_remote(&repo, "https://example.com/acme/live.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(repo); + assert!(cfg.try_relocate(&alias).is_none()); + } + + #[test] + fn try_relocate_none_without_recorded_remote() { + let tmp = tempfile::tempdir().unwrap(); + let plain = tmp.path().join("plain"); + std::fs::create_dir(&plain).unwrap(); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(plain.clone()); + assert!(cfg.meta(&alias).git_remote.is_none()); + + std::fs::rename(&plain, tmp.path().join("plain-moved")).unwrap(); + assert!(cfg.try_relocate(&alias).is_none()); + } + + #[test] + fn reconcile_drops_empty_alias_key() { + let mut cfg = ReposConfig::default(); + cfg.repos.insert(String::new(), PathBuf::from("/tmp/x")); + cfg.repos + .insert("good".to_string(), PathBuf::from("/tmp/good")); + cfg.reconcile(); + assert!(!cfg.repos.contains_key("")); + assert!(cfg.repos.contains_key("good")); + } + + #[test] + fn reconcile_prunes_unknown_group_members_and_empty_groups() { + let mut cfg = ReposConfig::default(); + cfg.repos + .insert("real".to_string(), PathBuf::from("/tmp/real")); + cfg.groups.insert( + "mix".to_string(), + vec!["real".to_string(), "ghost".to_string()], + ); + cfg.groups + .insert("dead".to_string(), vec!["ghost".to_string()]); + cfg.reconcile(); + assert_eq!(cfg.groups.get("mix"), Some(&vec!["real".to_string()])); + assert!( + !cfg.groups.contains_key("dead"), + "group with only unknown members should be dropped" + ); + } + + #[test] + 
fn reconcile_drops_orphan_meta() { + let mut cfg = ReposConfig::default(); + cfg.repos + .insert("real".to_string(), PathBuf::from("/tmp/real")); + cfg.repos_meta + .insert("ghost".to_string(), RepoMeta::default()); + cfg.reconcile(); + assert!(!cfg.repos_meta.contains_key("ghost")); + } + + #[test] + fn try_relocate_none_when_ambiguous() { + let tmp = tempfile::tempdir().unwrap(); + let original = tmp.path().join("orig"); + std::fs::create_dir(&original).unwrap(); + init_git_remote(&original, "https://example.com/acme/dup.git"); + + let mut cfg = ReposConfig::default(); + let alias = cfg.register(original.clone()); + + // Two candidates with the same remote → ambiguous → no relocation. + let a = tmp.path().join("copy-a"); + let b = tmp.path().join("copy-b"); + std::fs::create_dir(&a).unwrap(); + std::fs::create_dir(&b).unwrap(); + init_git_remote(&a, "https://example.com/acme/dup.git"); + init_git_remote(&b, "https://example.com/acme/dup.git"); + std::fs::remove_dir_all(&original).unwrap(); + + assert!(cfg.try_relocate(&alias).is_none()); + } + #[test] fn test_unique_alias_generation() { let mut repos = HashMap::new(); diff --git a/src/index/mod.rs b/src/index/mod.rs index 14a2d57..798fb2c 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -1250,10 +1250,43 @@ fn print_repo_stats(repo_path: &Path, db_path: &Path) -> Result<()> { } +/// Remove stale entries from `repos.json`. +/// +/// For each registered repo whose path no longer exists on disk (e.g. its +/// folder was renamed/moved), a best-effort git-identity relocation is tried +/// first; only entries that cannot be relocated are unregistered. Prints a +/// summary of what was relocated/removed. 
+pub async fn prune_index() -> Result<()> { + use crate::db_discovery::repos::ReposConfig; + + let mut config = ReposConfig::load()?; + let (relocated, removed) = config.prune_stale(); + + if relocated.is_empty() && removed.is_empty() { + println!("✅ No stale repositories found — repos.json is clean."); + return Ok(()); + } + + config.save()?; + + for (alias, path) in &relocated { + println!("📍 relocated '{}' → {}", alias, path.display()); + } + for alias in &removed { + println!("🗑️ removed stale entry '{}'", alias); + } + println!( + "✅ Prune complete: {} relocated, {} removed.", + relocated.len(), + removed.len() + ); + + Ok(()) +} + /// Add a repository to the index (creates local or global) pub async fn add_to_index( path: Option, global: bool, - alias: Option, cancel_token: CancellationToken, ) -> Result<()> { let project_path = path.as_deref().unwrap_or_else(|| Path::new(".")); @@ -1267,8 +1300,9 @@ pub async fn add_to_index( // Serve handles: register in repos.json + create index + warmup. let add_delegate = serve_delegate_with_warmup_wait(|| { let path = path.clone(); - let alias = alias.clone(); - async move { try_delegate_add_to_serve(&path, &alias, global).await } + // Alias is always derived from the directory name; the CLI no longer + // lets the user set it. Pass None so serve derives it consistently. + async move { try_delegate_add_to_serve(&path, &None, global).await } }) .await; @@ -1315,24 +1349,19 @@ pub async fn add_to_index( println!(" Type: {}", "Local".bright_green()); } - // If an alias is provided and this is a local DB in the current dir, - // register it in repos.json (for legacy DB's that predate auto-registration). - if alias.is_some() && db.is_current && !db.is_global { + // If this is a local DB in the current dir, ensure it is registered in + // repos.json (for legacy DBs that predate auto-registration). The alias + // is always derived from the directory name. 
+ if db.is_current && !db.is_global { let mut config = crate::db_discovery::repos::ReposConfig::load().unwrap_or_default(); if let Some(existing) = config.alias_for_path(&canonical_path) { println!(" Already registered as '{}'.", existing); } else { - match config.register_with_alias(canonical_path.clone(), alias.clone()) { - Ok(assigned) => { - if let Err(e) = config.save() { - eprintln!("⚠️ Failed to save repos config: {}", e); - } else { - println!(" ✅ Registered as '{}'.", assigned); - } - } - Err(e) => { - eprintln!("⚠️ Registration failed: {}", e); - } + let assigned = config.register(canonical_path.clone()); + if let Err(e) = config.save() { + eprintln!("⚠️ Failed to save repos config: {}", e); + } else { + println!(" ✅ Registered as '{}'.", assigned); } } return Ok(()); @@ -1426,21 +1455,12 @@ pub async fn add_to_index( if let Some(existing) = config.alias_for_path(&canonical_path) { eprintln!("ℹ️ Already registered as '{}'.", existing); } else { - match config.register_with_alias(canonical_path.clone(), alias) { - Ok(assigned) => { - if let Err(e) = config.save() { - eprintln!("⚠️ Index created, but failed to save repos config: {}", e); - eprintln!(" Config path: {}", config_path.display()); - } else { - eprintln!("✅ Registered as '{}'.", assigned); - } - } - Err(e) => { - return Err(anyhow::anyhow!( - "Index created, but registration failed: {}", - e - )); - } + let assigned = config.register(canonical_path.clone()); + if let Err(e) = config.save() { + eprintln!("⚠️ Index created, but failed to save repos config: {}", e); + eprintln!(" Config path: {}", config_path.display()); + } else { + eprintln!("✅ Registered as '{}'.", assigned); } } } diff --git a/src/serve/mod.rs b/src/serve/mod.rs index 82c4c39..eb0fac8 100644 --- a/src/serve/mod.rs +++ b/src/serve/mod.rs @@ -563,6 +563,52 @@ impl ServeState { }); } + /// Reconcile registered repo paths against the filesystem before warmup. 
+ /// + /// For each alias whose stored path no longer exists (folder renamed/moved), + /// attempt a best-effort git-identity relocation and rewrite `repos.json`. + /// When relocation fails the entry is left in place and merely logged — it is + /// skipped safely at warmup and never crashes serve. Explicit cleanup of + /// unrecoverable entries is available via `codesearch index prune`. + pub(crate) fn reconcile_all_paths(self: &Arc) { + let aliases = self.aliases(); + if aliases.is_empty() { + return; + } + + let mut config = match self.config.write() { + Ok(c) => c, + Err(e) => { + warn!("reconcile: config lock poisoned: {}", e); + return; + } + }; + + let (relocated, unresolved) = config.relocate_missing(); + + for (alias, new_path) in &relocated { + info!("reconcile: relocated '{}' → {}", alias, new_path.display()); + } + for alias in &unresolved { + let missing = config + .repos + .get(alias) + .map(|p| p.display().to_string()) + .unwrap_or_default(); + warn!( + "reconcile: '{}' path missing ({}); skipping — \ + run `codesearch index prune` to remove it", + alias, missing + ); + } + + if !relocated.is_empty() { + if let Err(e) = self.persist_config(&config) { + warn!("reconcile: failed to persist relocated paths: {}", e); + } + } + } + /// Phase 1: warm all repos sequentially, awaiting incremental refresh per repo. pub(crate) async fn run_phase_1_warmup_all(self: &Arc) { let aliases = self.aliases(); @@ -662,6 +708,18 @@ impl ServeState { return; } }; + // Guard against stale entries whose folder was removed/renamed + // and could not be relocated: skip rather than run SCIP on a + // non-existent path. 
+ if !path.exists() { + warn!( + "phase-2: skip '{}' — path missing ({})", + alias, + path.display() + ); + drop(permit); + return; + } let db_path = path.join(DB_DIR_NAME); trigger_symbol_rebuild(&alias, &path, &db_path, &state).await; drop(permit); @@ -707,6 +765,11 @@ impl ServeState { None => continue, }; + // Skip stale entries whose folder no longer exists. + if !path.exists() { + continue; + } + // Only pre-warm repos that have a ready C# index let status = self .csharp_index_status @@ -2940,6 +3003,7 @@ pub async fn run_serve( { let phase_state = serve_state.clone(); tokio::spawn(async move { + phase_state.reconcile_all_paths(); phase_state.run_phase_1_warmup_all().await; phase_state.run_phase_2_csharp_scip().await; phase_state.run_phase_3_prewarm().await;