:` prefix internally. The suspect detector strips that prefix before matching and suggesting, so root-level `vendor/`, `package-lock.json`, `go.sum`, etc. in any individual repo are detected and the emitted `--ignore` globs are repo-relative (drop-in for `extract --ignore` and `scan --extract-ignore`). Same-shape findings across repos collapse to one suggestion (`dist/*` applies everywhere rather than being listed once per repo).
+
Statistical heuristics (very high churn-per-commit, single-author bulk updates) are deliberately out of scope — their false-positive rate on hand-authored code is higher than the path-based list and we'd rather stay quiet than cry wolf.
### `--mailmap` off by default
diff --git a/internal/report/profile_template.go b/internal/report/profile_template.go
index 5bb8f0f..5fedc3a 100644
--- a/internal/report/profile_template.go
+++ b/internal/report/profile_template.go
@@ -134,6 +134,42 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col
{{end}}
+{{if gt (len .Repos) 1}}
+Per-Repository Breakdown {{thousands (len .Repos)}} repositories
+How {{.Profile.Name}}'s work is split across repositories in this scan. Use this to point at the projects with the most activity, or to spot single-repo focus vs. broad multi-repo engagement.
+
+
+ | Repository |
+ Commits |
+ % of My Commits |
+ Churn |
+ % of My Churn |
+ Files |
+ Active days |
+ First → Last |
+
+{{range .Repos}}
+
+ | {{.Repo}} |
+ {{thousands .Commits}} |
+
+
+
+ {{printf "%.1f" .PctOfTotalCommits}}%
+
+ |
+ {{thousands .Churn}} |
+ {{printf "%.1f" .PctOfTotalChurn}}% |
+ {{thousands .Files}} |
+ {{.ActiveDays}} |
+ {{.FirstCommitDate}} → {{.LastCommitDate}} |
+
+{{end}}
+
+{{end}}
+
{{if .Profile.TopFiles}}
Top Files
Files this developer changed most (churn = additions + deletions). High churn on few files suggests deep ownership and potential knowledge concentration. · {{docRef "hotspots"}}
diff --git a/internal/report/report.go b/internal/report/report.go
index 80e875f..24c982f 100644
--- a/internal/report/report.go
+++ b/internal/report/report.go
@@ -641,6 +641,13 @@ type ProfileReportData struct {
MaxActivityCommits int
PatternGrid [7][24]int
MaxPattern int
+
+ // Repos is the per-repository breakdown filtered to this developer's
+ // commits. Empty on single-repo profile reports — gated in the
+ // template so existing single-repo callers see no change. The headline
+ // use case for `gitcortex scan --email me` lives here: the developer
+ // can see at a glance which repos they spent time in.
+ Repos []stats.RepoStat
}
func GenerateProfile(w io.Writer, ds *stats.Dataset, repoName, email string) error {
@@ -672,6 +679,7 @@ func GenerateProfile(w io.Writer, ds *stats.Dataset, repoName, email string) err
MaxActivityCommits: maxAct,
PatternGrid: p.WorkGrid,
MaxPattern: maxP,
+ Repos: stats.RepoBreakdown(ds, email),
}
return profileTmpl.Execute(w, data)
diff --git a/internal/report/report_test.go b/internal/report/report_test.go
index 2b4c826..45a91eb 100644
--- a/internal/report/report_test.go
+++ b/internal/report/report_test.go
@@ -5,6 +5,7 @@ import (
"fmt"
"os"
"path/filepath"
+ "regexp"
"strings"
"testing"
@@ -77,6 +78,117 @@ func TestGenerate_SmokeRender(t *testing.T) {
}
}
// The Per-Repository Breakdown is a consolidation metric about the
// developer's work, not about the repository set. It belongs on the
// profile (--email) report where "3 of my commits in auth, 50 in
// payments" tells a story; on the team report it would just restate
// git-history distribution, which is tautological. Assert the team
// render does NOT surface the section even with a multi-repo dataset.
func TestGenerate_TeamReportOmitsPerRepoBreakdown(t *testing.T) {
	dir := t.TempDir()
	alpha := filepath.Join(dir, "alpha.jsonl")
	beta := filepath.Join(dir, "beta.jsonl")
	// Two single-commit fixtures, same author in both repos.
	alphaRow := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","author_email":"me@x.com","author_name":"Me","author_date":"2024-01-01T00:00:00Z","additions":10,"deletions":0,"files_changed":1}
{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"a.go","additions":10,"deletions":0}
`
	betaRow := `{"type":"commit","sha":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb","author_email":"me@x.com","author_name":"Me","author_date":"2024-02-01T00:00:00Z","additions":20,"deletions":5,"files_changed":1}
{"type":"commit_file","commit":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb","path_current":"b.go","additions":20,"deletions":5}
`
	if err := os.WriteFile(alpha, []byte(alphaRow), 0o644); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(beta, []byte(betaRow), 0o644); err != nil {
		t.Fatal(err)
	}
	ds, err := stats.LoadMultiJSONL([]string{alpha, beta})
	if err != nil {
		t.Fatalf("LoadMultiJSONL: %v", err)
	}
	var buf bytes.Buffer
	if err := Generate(&buf, ds, "scan-fixture", 10, stats.StatsFlags{CouplingMinChanges: 1, NetworkMinFiles: 1}); err != nil {
		t.Fatalf("Generate: %v", err)
	}
	out := buf.String()
	if strings.Contains(out, "Per-Repository Breakdown") {
		t.Error("team report should not render the Per-Repository Breakdown; it's a profile-only metric")
	}
	// Sanity: the scan JSONL prefix still shows up in file paths etc.
	// — we didn't accidentally strip the repo context from the whole
	// report.
	if !strings.Contains(out, "alpha:") && !strings.Contains(out, "beta:") {
		t.Error("expected repo-prefixed paths to appear somewhere in the team report; none found")
	}
}
+
// End-to-end for the `gitcortex scan --email me@x.com --report …`
// flow. Covers three assertions at once:
// 1. GenerateProfile emits the Per-Repository Breakdown section
// when the dataset is multi-repo (gated on len(Repos) > 1).
// 2. Counts per repo are filtered to the dev — a commit by
// someone-else@x.com in alpha doesn't bleed into my profile's
// alpha row.
// 3. Files counted per repo are only files THIS dev touched — a
// colleague-exclusive file in alpha must not inflate my scope.
func TestGenerateProfile_MultiRepoBreakdownFiltersByEmail(t *testing.T) {
	dir := t.TempDir()
	alpha := filepath.Join(dir, "alpha.jsonl")
	beta := filepath.Join(dir, "beta.jsonl")

	// alpha: me has 1 commit on a.go; colleague has 1 commit on
	// colleague-only.go. beta: me has 1 commit on b.go.
	alphaContent := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa01","author_email":"me@x.com","author_name":"Me","author_date":"2024-01-10T00:00:00Z","additions":10,"deletions":0,"files_changed":1}
{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa01","path_current":"a.go","additions":10,"deletions":0}
{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa02","author_email":"colleague@x.com","author_name":"Col","author_date":"2024-01-11T00:00:00Z","additions":5,"deletions":0,"files_changed":1}
{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa02","path_current":"colleague-only.go","additions":5,"deletions":0}
`
	betaContent := `{"type":"commit","sha":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb01","author_email":"me@x.com","author_name":"Me","author_date":"2024-02-10T00:00:00Z","additions":20,"deletions":5,"files_changed":1}
{"type":"commit_file","commit":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb01","path_current":"b.go","additions":20,"deletions":5}
`
	if err := os.WriteFile(alpha, []byte(alphaContent), 0o644); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(beta, []byte(betaContent), 0o644); err != nil {
		t.Fatal(err)
	}

	ds, err := stats.LoadMultiJSONL([]string{alpha, beta})
	if err != nil {
		t.Fatalf("LoadMultiJSONL: %v", err)
	}

	var buf bytes.Buffer
	if err := GenerateProfile(&buf, ds, "scan-profile", "me@x.com"); err != nil {
		t.Fatalf("GenerateProfile: %v", err)
	}
	out := buf.String()

	if !strings.Contains(out, "Per-Repository Breakdown") {
		t.Fatal("profile report missing the breakdown section — scan --email users won't see their cross-repo split")
	}

	// Each row renders as `alpha | ` followed by
	// a `N | ` for commits. Assert 1 commit per repo — if the
	// filter leaked, alpha would show 2 (me's + colleague's).
	// NOTE(review): this pattern contains bare `|` alternation where
	// the surrounding markup appears to have been stripped in transit
	// — confirm against the committed template that it matches exactly
	// one row per repo before trusting the len(rows) == 2 gate.
	commitCountRe := regexp.MustCompile(`(alpha|beta) | \s*(\d+) | `)
	rows := commitCountRe.FindAllStringSubmatch(out, -1)
	if len(rows) != 2 {
		t.Fatalf("expected 2 repo rows in breakdown, got %d: %v", len(rows), rows)
	}
	for _, r := range rows {
		if r[2] != "1" {
			t.Errorf("repo %s shows %s commits in profile breakdown — email filter leaked (want 1)", r[1], r[2])
		}
	}

	// Colleague-exclusive file must not appear in my scope. The
	// template renders file counts as `N | ` inside each row
	// — indirect assertion: if the file-count bumped, the dev-filter
	// on devCommits is wrong.
	if strings.Contains(out, "colleague-only.go") {
		t.Error("profile report mentions a file only the colleague touched")
	}
}
+
func TestGenerate_EmptyDataset(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "empty.jsonl")
diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go
new file mode 100644
index 0000000..e8b943d
--- /dev/null
+++ b/internal/report/scan_index.go
@@ -0,0 +1,243 @@
+package report
+
+import (
+ "fmt"
+ "html/template"
+ "io"
+ "time"
+)
+
// ScanIndexEntry is one repo's row on the scan-index landing page.
// Successful repos populate the numeric fields; failed / pending
// repos leave them zero and surface Error instead. ReportHref is the
// relative URL the index uses to link into each per-repo report —
// empty when no report exists for that entry.
type ScanIndexEntry struct {
	Slug    string
	Path    string
	Status  string
	Error   string
	Commits int
	Devs    int
	Files   int
	Churn   int64
	// FirstCommitDate / LastCommitDate are YYYY-MM-DD strings; left
	// empty on failed / pending rows, which never produced history.
	FirstCommitDate string
	LastCommitDate  string
	// LastCommitAgo is LastCommitDate humanized relative to the
	// index generation time: "today" / "Nd ago" / "Nmo ago" / "Ny ago".
	// Lets operators spot abandoned repos in a list of 30 without
	// doing date math in their head.
	LastCommitAgo string
	// RecencyBucket classifies LastCommitAgo into "fresh" (≤ 30 days),
	// "stable" (≤ 1 year), "stale" (> 1 year). Drives the badge color
	// in the template so the cold repos stand out at a glance.
	RecencyBucket string
	ReportHref    string
}
+
+// HumanizeAgo is the caller entry point for the index: formats
+// `lastDate` (YYYY-MM-DD) as a compact "ago" phrase relative to
+// wall-clock now, and classifies recency into fresh / stable /
+// stale for template coloring. Empty pair on unparsable input so
+// failed/pending rows with no dates render nothing.
+func HumanizeAgo(lastDate string) (label, bucket string) {
+ return humanizeAgoAt(lastDate, time.Now())
+}
+
+// humanizeAgoAt is the testable core — same logic as HumanizeAgo
+// but takes an explicit "now" so tests can pin the reference time
+// and avoid drift.
+//
+// Future-commit policy: dates strictly AFTER today's UTC midnight
+// (e.g. tomorrow's YYYY-MM-DD due to clock skew or a future-dated
+// CI rewrite) yield empty — the index surfaces how STALE each repo
+// is, and labeling "in 3d" as "fresh" would actively mislead. Both
+// sides of the comparison reduce to UTC midnights so "tomorrow"
+// is detected even when the user's wall clock is mid-day; sub-day
+// committer-clock skew inside today's date still falls into
+// `days == 0 → "today"`, documented as the safe fallback at day
+// granularity.
+//
+// The earlier implementation computed `days` as
+// int(now.Sub(t).Hours() / 24), which truncates toward zero for
+// negative durations. A "tomorrow" date 12 hours ahead produced
+// days == 0 instead of -1, slipping past the future-date guard
+// and rendering "today" / fresh. Comparing the parsed date
+// directly against today's UTC midnight removes the truncation
+// gap.
+func humanizeAgoAt(lastDate string, now time.Time) (label, bucket string) {
+ t, err := time.Parse("2006-01-02", lastDate)
+ if err != nil {
+ return "", ""
+ }
+ today := now.UTC().Truncate(24 * time.Hour)
+ if t.After(today) {
+ return "", ""
+ }
+ days := int(today.Sub(t).Hours() / 24)
+ switch {
+ case days < 0:
+ // Defensive: t.After(today) above already covered this, but
+ // keep the branch so a later refactor that moves the guard
+ // can't silently produce a negative-days label.
+ return "", ""
+ case days == 0:
+ label = "today"
+ case days < 30:
+ label = fmt.Sprintf("%dd ago", days)
+ case days < 365:
+ // Cap the month reading at 11 — otherwise `days/30` emits
+ // "12mo ago" at days 360-364, which reads as older than
+ // "1y ago" even though it's actually in the still-stable
+ // (≤365d) band. Clamping keeps the label progression
+ // monotonic with the bucket color.
+ months := days / 30
+ if months > 11 {
+ months = 11
+ }
+ label = fmt.Sprintf("%dmo ago", months)
+ default:
+ label = fmt.Sprintf("%dy ago", days/365)
+ }
+ switch {
+ case days <= 30:
+ bucket = "fresh"
+ case days <= 365:
+ bucket = "stable"
+ default:
+ bucket = "stale"
+ }
+ return label, bucket
+}
+
// ScanIndexData is the top-level template input for the index page.
type ScanIndexData struct {
	// GeneratedAt is the human-readable render timestamp shown on the
	// page; GenerateScanIndex fills it with "now" when left empty.
	GeneratedAt string
	Repos       []ScanIndexEntry
	// TotalRepos / OKRepos / FailedRepos / PendingRepos are
	// precomputed so the template doesn't need conditional arithmetic.
	// Pending is distinct from failed: a pending repo is one the
	// worker never reached (cancelled mid-scan), a failed repo is one
	// whose extract or render broke. Mixing them in the summary would
	// read a cancel-shaped partial run as a fleet-of-errors.
	TotalRepos   int
	OKRepos      int
	FailedRepos  int
	PendingRepos int
	TotalCommits int
	TotalDevs    int
	// MaxCommits is the largest per-repo commit count — used to
	// normalize the bar widths so the relative-volume bars are
	// visually comparable across repos.
	MaxCommits int
}
+
+// GenerateScanIndex writes the scan landing page: a per-repo card
+// list with links to each repo's standalone report and a short
+// summary strip. Failures are surfaced inline rather than hidden so
+// operators can spot them at a glance and dig into the manifest.
+func GenerateScanIndex(w io.Writer, data ScanIndexData) error {
+ if data.GeneratedAt == "" {
+ data.GeneratedAt = time.Now().Format("2006-01-02 15:04")
+ }
+ return scanIndexTmpl.Execute(w, data)
+}
+
// scanIndexTmpl is parsed once at package init; template.Must panics
// on a syntax error in scanIndexHTML, surfacing template breakage at
// startup instead of at first render.
var scanIndexTmpl = template.Must(template.New("scan-index").Funcs(funcMap).Parse(scanIndexHTML))
+
+const scanIndexHTML = `
+
+
+
+gitcortex scan index ({{.TotalRepos}} repositories)
+
+
+
+
+
+
Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}{{if gt .PendingRepos 0}} ({{.PendingRepos}} pending){{end}}
+
Total commits
{{humanize .TotalCommits}}
+
Unique devs
{{thousands .TotalDevs}}
+
+
+Each repo below links to its own standalone report. Metrics are per-repository — no cross-repo aggregation that would mix signals from unrelated codebases. For a consolidated developer view, use gitcortex scan --report <file> --email <address>.
+
+{{$max := .MaxCommits}}
+{{range .Repos}}
+
+
+
+ {{if .ReportHref}}
{{.Slug}}{{else}}{{.Slug}}{{end}}
+ {{if ne .Status "ok"}}
{{.Status}}{{end}}
+
+
{{.Path}}
+ {{if .Error}}
{{.Error}}
{{end}}
+
+ {{if eq .Status "ok"}}
+
+ {{humanize .Commits}}
+ commits
+
+
+ {{humanize .Churn}}
+ churn
+
+
+ {{.Devs}}
+ devs
+
+
+ {{humanize .Files}}
+ files
+
+
+ {{if .LastCommitAgo}}{{.LastCommitAgo}}
{{end}}
+ {{if .LastCommitDate}}{{.FirstCommitDate}}
→ {{.LastCommitDate}}{{end}}
+
+ {{else}}
+
No report available.
+ {{end}}
+
+{{if eq .Status "ok"}}
+
+{{end}}
+{{end}}
+
+
+
+
+
+`
diff --git a/internal/report/scan_index_test.go b/internal/report/scan_index_test.go
new file mode 100644
index 0000000..470c10a
--- /dev/null
+++ b/internal/report/scan_index_test.go
@@ -0,0 +1,126 @@
+package report
+
+import (
+ "bytes"
+ "strings"
+ "testing"
+ "time"
+)
+
// TestHumanizeAgoAt pins the reference time (2026-04-20T12:00Z) so
// the day arithmetic behind every expectation below never drifts with
// the wall clock.
func TestHumanizeAgoAt(t *testing.T) {
	now := time.Date(2026, 4, 20, 12, 0, 0, 0, time.UTC)

	cases := []struct {
		name       string
		lastDate   string
		wantLabel  string
		wantBucket string
	}{
		// Boundary on day 0 — "today" reads cleaner than "0d ago".
		{"same day", "2026-04-20", "today", "fresh"},
		{"one day", "2026-04-19", "1d ago", "fresh"},
		{"29 days", "2026-03-22", "29d ago", "fresh"},
		// Transition from days to months at 30.
		{"30 days exact", "2026-03-21", "1mo ago", "fresh"},
		{"two months", "2026-02-15", "2mo ago", "stable"},
		{"eleven months", "2025-05-25", "11mo ago", "stable"},
		// Month label must not exceed "11mo" — days/30 = 12 on the
		// 360-day boundary, but the label is clamped so the stable
		// band always reads as sub-year.
		{"month clamp at 360 days", "2025-04-25", "11mo ago", "stable"},
		{"month clamp at 364 days", "2025-04-21", "11mo ago", "stable"},
		// 365-day boundary: still "stable" at exactly one year,
		// "stale" the day after. 2025-04-20 is 365 days before now.
		{"one year exact (stable boundary)", "2025-04-20", "1y ago", "stable"},
		{"one year + one day (stale)", "2025-04-19", "1y ago", "stale"},
		{"two years stale", "2024-04-10", "2y ago", "stale"},
		// Parse failure yields empty.
		{"bad input", "not-a-date", "", ""},
		// Future dates (clock skew) yield empty — we don't label "in 3d"
		// on an index that exists to surface recency of PAST commits.
		{"full-day future", "2026-05-01", "", ""},
		// Regression for the truncation-gap bug: tomorrow (~12h ahead
		// of `now`) used to yield int(-0.5) == 0 and silently render
		// as "today" / fresh. Comparing UTC date midnights instead
		// of raw durations rejects it correctly.
		{"tomorrow (sub-24h future)", "2026-04-21", "", ""},
		{"two days future", "2026-04-22", "", ""},
		// Sub-day future (committer-date hours ahead of scanner clock)
		// lands in days==0 → "today". Documented: the index works at
		// day granularity and "today" is the least-misleading fallback
		// for intra-day skew within today's date.
		{"same-day future (sub-day skew)", "2026-04-20", "today", "fresh"},
	}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			label, bucket := humanizeAgoAt(c.lastDate, now)
			if label != c.wantLabel {
				t.Errorf("label: got %q, want %q", label, c.wantLabel)
			}
			if bucket != c.wantBucket {
				t.Errorf("bucket: got %q, want %q", bucket, c.wantBucket)
			}
		})
	}
}
+
// End-to-end check that the recency chip renders for an ok entry
// and is absent for failed entries (which have no dates). Confirms
// the CSS bucket class is reachable by the template.
func TestGenerateScanIndex_RecencyChipRenders(t *testing.T) {
	data := ScanIndexData{
		GeneratedAt: "2026-04-20 12:00",
		TotalRepos:  2,
		OKRepos:     1,
		FailedRepos: 1,
		MaxCommits:  10,
		Repos: []ScanIndexEntry{
			{
				Slug:            "alive",
				Path:            "/work/alive",
				Status:          "ok",
				Commits:         10,
				FirstCommitDate: "2024-01-01",
				LastCommitDate:  "2026-04-18",
				LastCommitAgo:   "2d ago",
				RecencyBucket:   "fresh",
				ReportHref:      "alive.html",
			},
			{
				Slug:   "broken",
				Path:   "/work/broken",
				Status: "failed",
				Error:  "boom",
			},
		},
	}
	var buf bytes.Buffer
	if err := GenerateScanIndex(&buf, data); err != nil {
		t.Fatal(err)
	}
	out := buf.String()
	if !strings.Contains(out, `class="recency fresh"`) {
		t.Error("fresh recency chip missing from ok entry")
	}
	if !strings.Contains(out, `>2d ago<`) {
		t.Error("recency label text missing")
	}
	// Anchor the chip inside the `.dates` wrapper — without this,
	// a future template change that moved the chip outside the
	// dates cell (losing the date context) would still satisfy the
	// class-only assertion above.
	if !strings.Contains(out, `class="dates"`) {
		t.Error("`.dates` wrapper missing — the recency chip has lost its structural anchor")
	}
	// NOTE(review): the ordering check below compares first
	// occurrences only; that is sufficient while there is exactly one
	// ok row in the fixture, but would weaken with multiple ok rows.
	datesIdx := strings.Index(out, `class="dates"`)
	recencyIdx := strings.Index(out, `class="recency fresh"`)
	if datesIdx < 0 || recencyIdx < 0 || recencyIdx < datesIdx {
		t.Errorf("recency chip not inside `.dates` wrapper: dates@%d recency@%d", datesIdx, recencyIdx)
	}
	// Failed entry has no dates block, so no recency chip should
	// render for it — a weak but useful guard against a template
	// restructure leaking the chip into the failure render.
	if strings.Count(out, `class="recency`) != 1 {
		t.Errorf("expected exactly one recency chip (ok entry only); got %d", strings.Count(out, `class="recency`))
	}
}
diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go
new file mode 100644
index 0000000..8fc96e5
--- /dev/null
+++ b/internal/scan/discovery.go
@@ -0,0 +1,357 @@
+package scan
+
+import (
+ "context"
+ "crypto/sha1"
+ "encoding/hex"
+ "fmt"
+ "log"
+ "os"
+ "path/filepath"
+ "sort"
+ "strings"
+)
+
// Repo describes one git repository discovered during scan.
type Repo struct {
	// AbsPath is the absolute path to the working tree root.
	AbsPath string
	// RelPath is AbsPath relative to the scan root it was found under
	// ("." when the root itself is the repo).
	RelPath string
	// Slug is a filesystem-safe identifier used to derive the output
	// JSONL file name and the path prefix that stats will use as the
	// repo label. Must be unique across the scan (collisions are resolved
	// by appending a short hash of AbsPath).
	Slug string
}
+
// Discover walks the given roots and returns every git repository it finds.
// Directories matched by the ignore matcher are pruned. Each repo's Slug is
// unique across the full result so downstream JSONL files don't collide,
// and the prefix LoadMultiJSONL derives (basename-minus-ext) still groups
// correctly.
//
// A cancelled ctx aborts the walk promptly — important because on large
// roots (home directory scans, monorepos of repos) discovery is often
// the longest phase of scan and Ctrl+C needs to land in real time, not
// "after the walk naturally finishes". Each filepath.WalkDir callback
// checks ctx.Err() before doing any filesystem work, so the abort
// window is bounded by the cost of one stat call per dir entry.
func Discover(ctx context.Context, roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) {
	if matcher == nil {
		matcher = NewMatcher(nil)
	}
	if ctx == nil {
		ctx = context.Background()
	}
	var repos []Repo
	seen := make(map[string]bool)

	for _, root := range roots {
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		abs, err := filepath.Abs(root)
		if err != nil {
			return nil, fmt.Errorf("resolve %s: %w", root, err)
		}
		// Canonicalize via EvalSymlinks before the is-directory Stat
		// below. Stat would follow a symlink for that check, but
		// filepath.WalkDir treats a symlink ROOT specially: it visits
		// only the link itself (reported with ModeSymlink) and does
		// NOT descend into the target. For a user whose ~/work is a
		// symlink to /mnt/data/work — a common setup — the walk would
		// yield zero callbacks past the root and scan would conclude
		// "no git repositories found under ..." despite the target
		// being full of repos. EvalSymlinks dereferences the root once
		// so WalkDir starts with a real directory path; links
		// ENCOUNTERED during the walk are still left as-is (default
		// WalkDir behavior), which is what we want — we don't chase
		// every symlink we come across, only the ones the user
		// explicitly named as a root.
		resolved, err := filepath.EvalSymlinks(abs)
		if err != nil {
			return nil, fmt.Errorf("resolve symlinks for %s: %w", abs, err)
		}
		abs = resolved
		info, err := os.Stat(abs)
		if err != nil {
			return nil, fmt.Errorf("stat %s: %w", abs, err)
		}
		if !info.IsDir() {
			return nil, fmt.Errorf("%s is not a directory", abs)
		}

		err = filepath.WalkDir(abs, func(path string, d os.DirEntry, werr error) error {
			// Return ctx.Err() (not SkipDir) so WalkDir short-circuits
			// the entire walk. Without this check, a cancelled scan
			// would keep stat'ing every directory in a large tree
			// before the caller saw the interrupt.
			if err := ctx.Err(); err != nil {
				return err
			}
			if werr != nil {
				// Permission errors on one subtree shouldn't abort the
				// whole scan. Log and keep walking.
				log.Printf("scan: skip %s: %v", path, werr)
				if d != nil && d.IsDir() {
					return filepath.SkipDir
				}
				return nil
			}
			if !d.IsDir() {
				return nil
			}

			rel, _ := filepath.Rel(abs, path)
			rel = filepath.ToSlash(rel)

			if maxDepth > 0 && rel != "." {
				depth := strings.Count(rel, "/") + 1
				if depth > maxDepth {
					return filepath.SkipDir
				}
			}

			if rel != "." && matcher.Match(rel, true) {
				// Don't SkipDir blindly — if any negation rule could
				// re-include a descendant (e.g. `vendor/` + `!vendor/keep`),
				// pruning here would drop the re-inclusion before its
				// target is visited.
				if !matcher.CouldReinclude(rel) {
					return filepath.SkipDir
				}
				// The dir itself is ignored; we only walk in so a
				// negated descendant can be visited in its own turn.
				// Return early so the .git detection below doesn't
				// record this ignored directory as a repo (and so its
				// `return filepath.SkipDir` doesn't cut the descent
				// off before the negation's target is seen). A
				// descendant whose rel path is re-included by `!rule`
				// will be examined by the next WalkDir callback
				// invocation.
				return nil
			}

			// Two repo shapes to detect:
			// 1. Working tree: path/.git is a dir (normal) or a
			//    `gitdir: …` pointer file (worktree/submodule).
			// 2. Bare repo: path itself contains HEAD + objects/
			//    + refs/ — common for clones used as fixtures or
			//    mirrors (`git clone --bare`, GitHub-style server dirs
			//    named foo.git).
			// Validating the .git entry (not just its presence) matters:
			// a random regular file named `.git` would otherwise be
			// accepted, the parent path recorded as a repo, and the
			// walker SkipDir'd out of the subtree — hiding any real
			// repos nested underneath and guaranteeing a downstream
			// "not a git repository" failure during extract. isBareRepo
			// separately requires HEAD + objects + refs so a stray HEAD
			// or empty objects dir can't false-positive either.
			gitEntry := filepath.Join(path, ".git")
			if isWorkingTreeEntry(gitEntry) || isBareRepo(path) {
				if seen[path] {
					return filepath.SkipDir
				}
				seen[path] = true
				repos = append(repos, Repo{
					AbsPath: path,
					RelPath: rel,
				})
				// Don't descend into a repo — nested repos (submodules,
				// vendored repos) get picked up separately only if they
				// live outside this repo's worktree, which is rare. If
				// users need submodule coverage they can list the parent
				// and the submodule paths as separate roots.
				return filepath.SkipDir
			}

			return nil
		})
		if err != nil {
			return nil, err
		}
	}

	assignSlugs(repos)
	// Sort by slug so output ordering is deterministic across runs
	// (WalkDir ordering is, but concatenating multiple roots could shuffle).
	sort.Slice(repos, func(i, j int) bool { return repos[i].Slug < repos[j].Slug })
	return repos, nil
}
+
// initialSlugHashLen is the default hash-suffix length, in hex chars
// (6 hex = 24 bits), appended when two repos share a basename. Six
// chars keep slugs readable in the common case; the retry loop in
// assignSlugs grows the length whenever the truncated hashes still
// collide.
const initialSlugHashLen = 6
+
// isReservedSlug reports whether base would collide with a name
// downstream consumers already use for their own output file.
// `scan --report-dir` writes its landing page as index.html, so a
// repo whose basename was literally `index` would overwrite it (or
// be overwritten by it). Forcing the hash branch for reserved names
// avoids the collision without losing the repo.
//
// The reserved set is expressed as code rather than a package-level
// map so it stays immutable — a mutable map would let one test
// silently leak an entry into every other test that runs afterward.
// Case folded to align with the case-insensitivity elsewhere in
// assignSlugs.
func isReservedSlug(base string) bool {
	return strings.ToLower(base) == "index"
}
+
// assignSlugs derives a unique slug per repo from its basename, falling
// back to `<base>-<hash-of-AbsPath>` when two repos share a name.
// The slug is also the JSONL filename stem and the persistence key
// (`<slug>.state`), so uniqueness AND determinism across runs both matter:
// if a re-run swaps which sibling gets the bare name, the per-repo
// state file is orphaned and the affected repos are re-extracted from
// scratch (or worse, two repos collide onto one state).
//
// Two-pass: count basenames first, then suffix with a hash whenever the
// base appears more than once anywhere in the result. This makes the
// slug a pure function of (absPath, set of basenames seen), independent
// of WalkDir traversal order.
//
// Short-hash truncation (6 hex = 24 bits) admits a small but nonzero
// collision probability — two absolute paths whose SHA-1 digests share
// the same 6-hex prefix would land on the same slug, and scan would
// silently overwrite one repo's JSONL + state file with the other's.
// The retry loop walks the proposed slug set; on any duplicate we
// redo the pass with a longer hash. Grows up to the full 40 hex chars
// before panicking — needing that much is a cryptographic event, not
// a scan-time bug.
//
// Case-insensitive uniqueness: on macOS (APFS/HFS+ in the default
// configuration) and Windows (NTFS), `Repo.jsonl` and `repo.jsonl`
// resolve to the same file on disk. A case-sensitive slug compare
// would treat the two repos as distinct, hand each the bare basename,
// and then quietly let the filesystem merge their JSONL and state
// files. Fold to lower case for BOTH the duplicate-detection count
// and the uniqueness seen-set so collision-triggered hashing fires
// on case-only differences. The emitted slug retains the original
// case for readability (so paths named Repo and repo produce
// `Repo-<hash>` and `repo-<hash>` — visually distinct to humans,
// case-insensitively distinct on disk).
func assignSlugs(repos []Repo) {
	// Pass 1: count case-folded basenames so a duplicate anywhere in
	// the result deterministically forces the hash suffix for ALL of
	// its holders.
	counts := make(map[string]int)
	bases := make([]string, len(repos))
	for i := range repos {
		base := sanitizeSlug(filepath.Base(repos[i].AbsPath))
		if base == "" {
			// Degenerate basename (sanitized to nothing) — use a
			// generic stem; if several repos degenerate to it, the
			// duplicate count above triggers hashing as usual.
			base = "repo"
		}
		bases[i] = base
		counts[strings.ToLower(base)]++
	}

	// Pass 2, retried with a growing hash length: propose slugs and
	// accept the set only when it is case-insensitively collision-free.
	for hashLen := initialSlugHashLen; hashLen <= 40; hashLen += 2 {
		proposed := make([]string, len(repos))
		seen := make(map[string]int, len(repos))
		collided := false
		for i := range repos {
			base := bases[i]
			slug := base
			if counts[strings.ToLower(base)] > 1 || isReservedSlug(base) {
				h := sha1.Sum([]byte(repos[i].AbsPath))
				slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:hashLen])
			}
			key := strings.ToLower(slug)
			if prev, ok := seen[key]; ok && prev != i {
				collided = true
				break
			}
			seen[key] = i
			proposed[i] = slug
		}
		if !collided {
			for i := range repos {
				repos[i].Slug = proposed[i]
			}
			return
		}
	}
	panic("scan: slug assignment failed to find unique hash within SHA-1 range")
}
+
+// isBareRepo returns true when path is a bare git repository — i.e. the
+// directory itself holds HEAD, objects/, and refs/ rather than wrapping
+// them in a .git subdirectory. All three entries are required because
+// a stray HEAD file or empty refs/ dir alone is not enough to be a real
+// repo and we don't want false positives polluting the manifest.
+func isBareRepo(path string) bool {
+ for _, name := range []string{"HEAD", "objects", "refs"} {
+ if _, err := os.Stat(filepath.Join(path, name)); err != nil {
+ return false
+ }
+ }
+ return true
+}
+
+// isWorkingTreeEntry reports whether gitEntry (the path of a `.git`
+// dirent inside a candidate repo) marks a real git working tree.
+// Accepts:
+// - A directory named `.git` — the standard layout.
+// - A regular file named `.git` beginning with `gitdir: ` — the
+// pointer format git writes for linked worktrees and submodules.
+// Rejects everything else: symlinks (by existing policy — the target
+// could be anywhere), irregular files, and — the case that motivated
+// the validation — a regular file named `.git` that happens to exist
+// with unrelated content. Without this check, any plain file called
+// `.git` (source dump, build artifact, user mistake) made discovery
+// record its parent as a repo and SkipDir out of the subtree, hiding
+// nested real repos and producing a guaranteed extract failure.
+func isWorkingTreeEntry(gitEntry string) bool {
+ info, err := os.Lstat(gitEntry)
+ if err != nil {
+ return false
+ }
+ if info.Mode()&os.ModeSymlink != 0 {
+ return false
+ }
+ if info.IsDir() {
+ return true
+ }
+ if !info.Mode().IsRegular() {
+ return false
+ }
+ // A valid gitdir pointer file starts with literally "gitdir: ".
+ // Checking only the first 8 bytes is sufficient (and cheap) — the
+ // full content isn't parsed here; extract will fail loudly if the
+ // pointer target is broken, which is a better signal than silently
+ // accepting every regular file and leaving the same confusion for
+ // the extract phase.
+ f, openErr := os.Open(gitEntry)
+ if openErr != nil {
+ return false
+ }
+ defer f.Close()
+ const prefix = "gitdir: "
+ buf := make([]byte, len(prefix))
+ n, _ := f.Read(buf)
+ return n == len(prefix) && string(buf) == prefix
+}
+
// sanitizeSlug maps every character outside [A-Za-z0-9._-] to '_',
// keeping slugs safe both as filenames and under the LoadMultiJSONL
// prefix contract (no ':' may survive).
func sanitizeSlug(s string) string {
	keepOrReplace := func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
			return r
		case r == '-', r == '_', r == '.':
			return r
		}
		return '_'
	}
	return strings.Map(keepOrReplace, s)
}
diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go
new file mode 100644
index 0000000..97d9e70
--- /dev/null
+++ b/internal/scan/discovery_test.go
@@ -0,0 +1,682 @@
+package scan
+
+import (
+ "context"
+ "crypto/sha1"
+ "encoding/hex"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+ "time"
+)
+
+func TestDiscover_FindsRepos(t *testing.T) {
+ root := t.TempDir()
+
+ mustMkRepo(t, filepath.Join(root, "a"))
+ mustMkRepo(t, filepath.Join(root, "b"))
+ mustMkRepo(t, filepath.Join(root, "nested", "c"))
+ // Plain dir without .git — must NOT be picked up.
+ if err := os.MkdirAll(filepath.Join(root, "not-a-repo", "src"), 0o755); err != nil {
+ t.Fatal(err)
+ }
+
+ repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatalf("Discover: %v", err)
+ }
+ if len(repos) != 3 {
+ t.Fatalf("expected 3 repos, got %d: %+v", len(repos), repos)
+ }
+ got := map[string]bool{}
+ for _, r := range repos {
+ got[r.Slug] = true
+ }
+ for _, want := range []string{"a", "b", "c"} {
+ if !got[want] {
+ t.Errorf("expected slug %q in %v", want, got)
+ }
+ }
+}
+
+func TestDiscover_RespectsIgnore(t *testing.T) {
+ root := t.TempDir()
+ mustMkRepo(t, filepath.Join(root, "keep"))
+ mustMkRepo(t, filepath.Join(root, "node_modules", "garbage"))
+
+ matcher := NewMatcher([]string{"node_modules"})
+ repos, err := Discover(context.Background(), []string{root}, matcher, 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(repos) != 1 || repos[0].Slug != "keep" {
+ t.Fatalf("expected only `keep`, got %+v", repos)
+ }
+}
+
+// Regression: `vendor/` + `!vendor/keep` must descend into vendor/
+// so the negation has a chance to re-include vendor/keep. Before the
+// fix, the walker SkipDir'd vendor unconditionally and the re-included
+// repo was silently dropped.
+func TestDiscover_HonorsNegatedDescendant(t *testing.T) {
+ root := t.TempDir()
+ mustMkRepo(t, filepath.Join(root, "app"))
+ mustMkRepo(t, filepath.Join(root, "vendor", "garbage"))
+ mustMkRepo(t, filepath.Join(root, "vendor", "keep"))
+
+ matcher := NewMatcher([]string{"vendor/", "!vendor/keep"})
+ repos, err := Discover(context.Background(), []string{root}, matcher, 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ got := map[string]bool{}
+ for _, r := range repos {
+ got[r.RelPath] = true
+ }
+ if !got["app"] {
+ t.Errorf("app should be included: %+v", repos)
+ }
+ if !got["vendor/keep"] {
+ t.Errorf("vendor/keep should be re-included by negation rule: %+v", repos)
+ }
+ if got["vendor/garbage"] {
+ t.Errorf("vendor/garbage should remain ignored: %+v", repos)
+ }
+}
+
+// Regression: when the ignored directory itself is a repo and a
+// negation rule points at a descendant, the walker must NOT record
+// the ignored dir (it's ignored) AND must keep descending so the
+// negation's target can be visited. Previously the .git detection
+// ran unconditionally and both incorrectly recorded the ignored
+// parent AND SkipDir'd the child out of the walk.
+// With a 6-hex (24-bit) suffix, the birthday paradox predicts a
+// truncation collision around ~2^12 duplicates. Generating 10k paths
+// that share a basename makes collisions statistically near-certain
+// (expected count ≈ 3). The invariant "all resulting slugs are
+// distinct" forces assignSlugs's retry loop to grow the hash and
+// still produce a unique namespace — the exact corruption class the
+// extra iterations were added to prevent.
+func TestAssignSlugs_UniqueEvenUnderTruncationCollisions(t *testing.T) {
+ const n = 10000
+ repos := make([]Repo, n)
+ for i := 0; i < n; i++ {
+ repos[i] = Repo{AbsPath: fmt.Sprintf("/path/to/dir-%06d/myrepo", i)}
+ }
+ assignSlugs(repos)
+
+ slugs := make(map[string]int, n)
+ for i, r := range repos {
+ if prev, ok := slugs[r.Slug]; ok {
+ t.Fatalf("duplicate slug %q between repos[%d]=%s and repos[%d]=%s — JSONL + state files would collide",
+ r.Slug, prev, repos[prev].AbsPath, i, r.AbsPath)
+ }
+ slugs[r.Slug] = i
+ }
+}
+
+// Directly exercise the retry path with two paths constructed to
+// collide at initialSlugHashLen. Without the retry, both would get
+// slug "myrepo-" and the scan would silently overwrite one
+// repo's files with the other's. With the retry, the loop grows
+// the hash until the pair separates.
+func TestAssignSlugs_ResolvesFirstPrefixCollision(t *testing.T) {
+ a, b, found := findColliding6HexPaths(50000)
+ if !found {
+ t.Skip("no colliding pair found within search budget — astronomically unlikely, skip rather than flake")
+ }
+ repos := []Repo{{AbsPath: a}, {AbsPath: b}}
+ assignSlugs(repos)
+
+ if repos[0].Slug == repos[1].Slug {
+ t.Fatalf("retry failed: both repos got slug %q for colliding paths %s and %s", repos[0].Slug, a, b)
+ }
+ // Sanity: the slug suffix must be longer than the initial 6 hex,
+ // proving the retry branch actually fired.
+ const minLenAfterRetry = len("myrepo-") + initialSlugHashLen + 1
+ if len(repos[0].Slug) < minLenAfterRetry && len(repos[1].Slug) < minLenAfterRetry {
+ t.Errorf("expected at least one slug to have a longer hash after retry; got %q and %q", repos[0].Slug, repos[1].Slug)
+ }
+}
+
+// findColliding6HexPaths searches a deterministic sequence of
+// `myrepo` paths for any two whose SHA-1 digests share the first
+// initialSlugHashLen hex chars. At 24 bits of resolution the
+// birthday bound hits 50% around N≈4900, so maxAttempts=50000 is
+// overwhelmingly likely to yield a pair. Returns false only if no
+// collision was found — the test treats that as a skip, not a
+// failure, since the worst-case outcome is a missed opportunity to
+// exercise the retry, not a bug.
+func findColliding6HexPaths(maxAttempts int) (string, string, bool) {
+ seen := make(map[string]string, maxAttempts)
+ for i := 0; i < maxAttempts; i++ {
+ p := fmt.Sprintf("/search/path-%07d/myrepo", i)
+ h := sha1.Sum([]byte(p))
+ prefix := hex.EncodeToString(h[:])[:initialSlugHashLen]
+ if prev, ok := seen[prefix]; ok {
+ return prev, p, true
+ }
+ seen[prefix] = p
+ }
+ return "", "", false
+}
+
+// `index` is the landing-page filename used by `scan --report-dir`.
+// A repo whose basename is literally `index` would emit
+// `/index.html`, colliding with (and getting overwritten by)
+// the landing page. Force the hash branch for reserved names so
+// the per-repo report file never lands on a reserved path.
+func TestAssignSlugs_ReservesIndex(t *testing.T) {
+ repos := []Repo{
+ {AbsPath: "/workspace/index"},
+ {AbsPath: "/other/unrelated"},
+ }
+ assignSlugs(repos)
+
+ for _, r := range repos {
+ if filepath.Base(r.AbsPath) == "index" && r.Slug == "index" {
+ t.Errorf("`index` basename must not produce the bare slug; landing page would be overwritten (path=%s, slug=%s)", r.AbsPath, r.Slug)
+ }
+ }
+ // Case-insensitivity: `Index` would also collide on a case-
+ // insensitive filesystem serving the HTML.
+ repos = []Repo{{AbsPath: "/workspace/Index"}}
+ assignSlugs(repos)
+ if strings.EqualFold(repos[0].Slug, "index") {
+ t.Errorf("`Index` basename must also be reserved; got %q", repos[0].Slug)
+ }
+}
+
+// Regression: on macOS (APFS/HFS+ default) and Windows (NTFS),
+// `Repo.jsonl` and `repo.jsonl` are the same file on disk. A
+// case-sensitive slug compare would hand both repos the bare
+// basename and let the filesystem merge their outputs — one scan's
+// JSONL silently overwrites the other. Counts and seen-set are
+// lower-cased so any case-only difference is treated as a collision
+// and forces the hash suffix.
+func TestAssignSlugs_CaseInsensitiveUniqueness(t *testing.T) {
+ repos := []Repo{
+ {AbsPath: "/a/Team/Repo"},
+ {AbsPath: "/b/OSS/repo"},
+ }
+ assignSlugs(repos)
+
+ if repos[0].Slug == repos[1].Slug {
+ t.Fatalf("repos with case-only-differing basenames got same slug %q", repos[0].Slug)
+ }
+ // The fs collision only goes away if the slugs also differ when
+ // folded to lower case — which is what decides filenames on
+ // case-insensitive fs.
+ if strings.EqualFold(repos[0].Slug, repos[1].Slug) {
+ t.Fatalf("slugs %q and %q differ only in case — they would still collide on macOS/Windows", repos[0].Slug, repos[1].Slug)
+ }
+ // Sanity: both should have gained a hash suffix (the bare `Repo`
+ // and `repo` weren't safe to emit).
+ for _, r := range repos {
+ if !strings.Contains(r.Slug, "-") {
+ t.Errorf("repo %s got bare slug %q; case-sensitive branch did not trigger hashing", r.AbsPath, r.Slug)
+ }
+ }
+}
+
+// Same paths ingested twice must produce the same slugs — this is
+// what makes resume work across runs. Even with the retry loop
+// adjusting hash length under collisions, a deterministic input set
+// must yield a deterministic output.
+func TestAssignSlugs_DeterministicUnderRetry(t *testing.T) {
+ build := func() []Repo {
+ rs := make([]Repo, 5000)
+ for i := 0; i < 5000; i++ {
+ rs[i] = Repo{AbsPath: fmt.Sprintf("/work/p-%05d/myrepo", i)}
+ }
+ return rs
+ }
+ a := build()
+ b := build()
+ assignSlugs(a)
+ assignSlugs(b)
+ for i := range a {
+ if a[i].Slug != b[i].Slug {
+ t.Errorf("slug for %s differs across runs: %q vs %q", a[i].AbsPath, a[i].Slug, b[i].Slug)
+ }
+ }
+}
+
+// Mid-walk cancel: the pre-cancelled test below only proves the
+// first callback checks ctx. This test cancels AFTER Discover has
+// already started walking — the scenario users hit with Ctrl+C on
+// a long home-dir scan. Asserting the walk stops in flight, not
+// just when prompted at the very start.
+//
+// The tree is large enough (5,000 dirs × 3 subdirs = 20k nodes)
+// that a full walk on typical hardware takes tens of ms; sleeping
+// briefly before cancel lands the signal mid-flight. We assert:
+// - err is context.Canceled (ctx check fired inside the walk)
+// - elapsed time < full baseline (walk was actually interrupted)
+// - at least one repo was discovered BEFORE cancel (not a
+// pre-cancel scenario in disguise)
+// Regression: WalkDir given a symlink ROOT visits only the link
+// entry and refuses to descend. Users whose ~/work is a symlink to
+// a real data disk would previously see "no git repositories found"
+// despite the target being full of repos. Canonicalize via
+// EvalSymlinks before walking so the walk starts at a real dir.
+func TestDiscover_DereferencesSymlinkRoot(t *testing.T) {
+ real := t.TempDir()
+ mustMkRepo(t, filepath.Join(real, "repo-a"))
+ mustMkRepo(t, filepath.Join(real, "nested", "repo-b"))
+
+ // Place the symlink in a separate TempDir so the test doesn't
+ // have to clean it up explicitly.
+ linkDir := t.TempDir()
+ link := filepath.Join(linkDir, "work")
+ if err := os.Symlink(real, link); err != nil {
+ t.Skipf("symlinks not supported here: %v", err)
+ }
+
+ repos, err := Discover(context.Background(), []string{link}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatalf("Discover via symlink root: %v", err)
+ }
+ if len(repos) != 2 {
+ t.Fatalf("want 2 repos found through symlink root, got %d: %+v", len(repos), repos)
+ }
+ got := map[string]bool{}
+ for _, r := range repos {
+ got[r.Slug] = true
+ }
+ for _, want := range []string{"repo-a", "repo-b"} {
+ if !got[want] {
+ t.Errorf("expected slug %q in %v (symlink root was not dereferenced)", want, got)
+ }
+ }
+}
+
+func TestDiscover_AbortsMidWalk(t *testing.T) {
+ if testing.Short() {
+ t.Skip("builds a large synthetic tree; skipped in -short mode")
+ }
+ root := t.TempDir()
+ const topLevel = 5000
+ for i := 0; i < topLevel; i++ {
+ parent := filepath.Join(root, fmt.Sprintf("d-%05d", i))
+ for j := 0; j < 3; j++ {
+ if err := os.MkdirAll(filepath.Join(parent, fmt.Sprintf("sub-%d", j)), 0o755); err != nil {
+ t.Fatal(err)
+ }
+ }
+ // Every top-level dir is a repo so the result count is a
+ // direct proxy for how far the walk got.
+ mustMkRepo(t, parent)
+ }
+
+ // Baseline: uncancelled walk finds all topLevel repos.
+ baseStart := time.Now()
+ full, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ baseline := time.Since(baseStart)
+ if len(full) != topLevel {
+ t.Fatalf("baseline walk should find all %d repos, got %d", topLevel, len(full))
+ }
+
+ // Cancelled walk: give the walk a head start, then cancel.
+ ctx, cancel := context.WithCancel(context.Background())
+ done := make(chan struct{})
+ go func() {
+ time.Sleep(baseline / 4)
+ cancel()
+ close(done)
+ }()
+
+ cancStart := time.Now()
+ repos, err := Discover(ctx, []string{root}, NewMatcher(nil), 0)
+ cancElapsed := time.Since(cancStart)
+ <-done
+
+ if err != context.Canceled {
+ t.Fatalf("want context.Canceled after mid-walk cancel, got err=%v (elapsed %v, baseline %v, repos %d)",
+ err, cancElapsed, baseline, len(repos))
+ }
+ if cancElapsed >= baseline {
+ t.Errorf("cancelled walk took %v — baseline was %v. Ctx respected but walk did not shortcut; users will see delayed Ctrl+C", cancElapsed, baseline)
+ }
+ if len(repos) >= topLevel {
+ t.Errorf("cancelled walk returned all %d repos — appears to have finished before cancel fired; baseline %v, elapsed %v", len(repos), baseline, cancElapsed)
+ }
+}
+
+// A cancelled context must abort the walk — previously Discover
+// ignored ctx entirely, so Ctrl+C during the walk phase only took
+// effect after every directory had been stat'd. Test uses a
+// pre-cancelled context to trip the early-return on the very first
+// callback; no repos should be returned and the error should equal
+// ctx.Err().
+func TestDiscover_AbortsOnCancelledContext(t *testing.T) {
+ root := t.TempDir()
+ // Enough decoy directories that an un-aborted walk would still
+ // produce observable work — the assertion below fails loudly if
+ // the check is skipped.
+ for i := 0; i < 20; i++ {
+ if err := os.MkdirAll(filepath.Join(root, fmt.Sprintf("dir-%d", i)), 0o755); err != nil {
+ t.Fatal(err)
+ }
+ }
+ mustMkRepo(t, filepath.Join(root, "would-be-found"))
+
+ ctx, cancel := context.WithCancel(context.Background())
+ cancel() // pre-cancelled
+
+ repos, err := Discover(ctx, []string{root}, NewMatcher(nil), 0)
+ if err != context.Canceled {
+ t.Fatalf("want context.Canceled, got err=%v", err)
+ }
+ if len(repos) != 0 {
+ t.Errorf("want empty repo list after cancel, got %+v", repos)
+ }
+}
+
+func TestDiscover_IgnoredRepoNotRecorded_DescendantStillFound(t *testing.T) {
+ root := t.TempDir()
+ // `vendor` is itself a repo AND is ignored by `vendor/`.
+ mustMkRepo(t, filepath.Join(root, "vendor"))
+ // `vendor/keep` is a nested repo, re-included by `!vendor/keep`.
+ mustMkRepo(t, filepath.Join(root, "vendor", "keep"))
+
+ matcher := NewMatcher([]string{"vendor/", "!vendor/keep"})
+ repos, err := Discover(context.Background(), []string{root}, matcher, 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ got := map[string]bool{}
+ for _, r := range repos {
+ got[r.RelPath] = true
+ }
+ if got["vendor"] {
+ t.Error("vendor itself is ignored and must not be recorded as a repo")
+ }
+ if !got["vendor/keep"] {
+ t.Errorf("vendor/keep should be re-included by the negation rule; got %+v", repos)
+ }
+}
+
+func TestMatcher_CouldReinclude(t *testing.T) {
+ cases := []struct {
+ name string
+ patterns []string
+ dir string
+ want bool
+ }{
+ {"no negation", []string{"vendor/"}, "vendor", false},
+ {"explicit descendant", []string{"vendor/", "!vendor/keep"}, "vendor", true},
+ {"unrelated negation", []string{"vendor/", "!src/main"}, "vendor", false},
+ {"basename negation could fire anywhere", []string{"vendor/", "!keep"}, "vendor", true},
+ {"deep-match negation", []string{"build/", "!**/src"}, "build", true},
+ // Globbed first segment — reviewer case.
+ {"glob star in segment", []string{"vendor*/", "!vendor*/keep"}, "vendor", true},
+ {"wildcard segment matches any parent", []string{"*/", "!*/keep"}, "vendor", true},
+ {"glob prefix that doesn't match dir", []string{"vendor*/", "!foo*/keep"}, "vendor", false},
+ {"nested dir with literal pattern", []string{"pkg/vendor/", "!pkg/vendor/keep"}, "pkg/vendor", true},
+ {"nested dir with glob in first segment", []string{"*/vendor/", "!*/vendor/keep"}, "pkg/vendor", true},
+ {"pattern with same segment count as dir can't match descendant", []string{"!vendor"}, "vendor", true}, // basename-anywhere
+ }
+ for _, c := range cases {
+ t.Run(c.name, func(t *testing.T) {
+ m := NewMatcher(c.patterns)
+ if got := m.CouldReinclude(c.dir); got != c.want {
+ t.Errorf("CouldReinclude(%q) with %v = %v, want %v", c.dir, c.patterns, got, c.want)
+ }
+ })
+ }
+}
+
+// Regression: glob-prefixed negations like `!vendor*/keep` or `!*/keep`
+// used to slip past CouldReinclude because it only matched literal
+// `dir + "/"` prefixes. Discovery then pruned vendor/ and the
+// re-included vendor/keep repo disappeared from the scan.
+func TestDiscover_HonorsGlobbedNegation(t *testing.T) {
+ root := t.TempDir()
+ mustMkRepo(t, filepath.Join(root, "vendor", "keep"))
+ mustMkRepo(t, filepath.Join(root, "vendor", "garbage"))
+ mustMkRepo(t, filepath.Join(root, "vendor-old", "keep"))
+ mustMkRepo(t, filepath.Join(root, "unrelated"))
+
+ matcher := NewMatcher([]string{"vendor*/", "!vendor*/keep"})
+ repos, err := Discover(context.Background(), []string{root}, matcher, 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ got := map[string]bool{}
+ for _, r := range repos {
+ got[r.RelPath] = true
+ }
+ want := []string{"vendor/keep", "vendor-old/keep", "unrelated"}
+ for _, w := range want {
+ if !got[w] {
+ t.Errorf("missing %q in discovered repos: %+v", w, repos)
+ }
+ }
+ if got["vendor/garbage"] {
+ t.Error("vendor/garbage should remain ignored")
+ }
+}
+
+func TestDiscover_DoesNotDescendIntoRepo(t *testing.T) {
+ root := t.TempDir()
+ parent := filepath.Join(root, "parent")
+ mustMkRepo(t, parent)
+ // A nested .git inside an already-discovered repo is treated as a
+ // submodule and skipped — we don't double-count.
+ mustMkRepo(t, filepath.Join(parent, "submodule"))
+
+ repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(repos) != 1 {
+ t.Fatalf("expected 1 repo (parent only), got %d: %+v", len(repos), repos)
+ }
+}
+
+func TestDiscover_SlugCollisionGetsHashSuffix(t *testing.T) {
+ root := t.TempDir()
+ mustMkRepo(t, filepath.Join(root, "a", "myrepo"))
+ mustMkRepo(t, filepath.Join(root, "b", "myrepo"))
+
+ repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(repos) != 2 {
+ t.Fatalf("expected 2 repos, got %d", len(repos))
+ }
+ if repos[0].Slug == repos[1].Slug {
+ t.Errorf("collision not resolved: both slugs are %q", repos[0].Slug)
+ }
+ // With the two-pass naming, BOTH duplicates carry a hash suffix —
+ // neither gets the bare basename. This guarantees the slug for any
+ // given path is stable regardless of which sibling WalkDir hits
+ // first across runs.
+ for _, r := range repos {
+ if r.Slug == "myrepo" {
+ t.Errorf("expected both colliding repos to get a hash suffix, but %s kept the bare name", r.AbsPath)
+ }
+ }
+}
+
+// Re-running discovery must produce the same slug for the same path
+// even when the WalkDir traversal could legally vary. This is the
+// invariant `.state` resumption depends on.
+func TestDiscover_SlugDeterministicAcrossRuns(t *testing.T) {
+ root := t.TempDir()
+ mustMkRepo(t, filepath.Join(root, "a", "myrepo"))
+ mustMkRepo(t, filepath.Join(root, "b", "myrepo"))
+
+ first, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ second, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(first) != len(second) {
+ t.Fatalf("repo count differs across runs: %d vs %d", len(first), len(second))
+ }
+ pathSlug := map[string]string{}
+ for _, r := range first {
+ pathSlug[r.AbsPath] = r.Slug
+ }
+ for _, r := range second {
+ if pathSlug[r.AbsPath] != r.Slug {
+ t.Errorf("slug for %s changed across runs: %q → %q", r.AbsPath, pathSlug[r.AbsPath], r.Slug)
+ }
+ }
+}
+
+// Regression: a random regular file named `.git` (not the
+// `gitdir: …` pointer format) used to make discovery record the
+// parent as a repo, SkipDir out of the subtree, and hide any real
+// repos nested below. Now the .git entry is validated — arbitrary
+// files are rejected AND the walker keeps descending to find real
+// repos deeper.
+func TestDiscover_RejectsArbitraryGitFileAndDescends(t *testing.T) {
+ root := t.TempDir()
+ // Parent with a bogus `.git` file — looks like a repo by naive
+ // existence check, isn't by content check.
+ bogus := filepath.Join(root, "bogus")
+ if err := os.MkdirAll(bogus, 0o755); err != nil {
+ t.Fatal(err)
+ }
+ if err := os.WriteFile(filepath.Join(bogus, ".git"), []byte("not a git pointer file\n"), 0o644); err != nil {
+ t.Fatal(err)
+ }
+ // Real nested repo the old code would have hidden via SkipDir
+ // after recording the bogus parent.
+ mustMkRepo(t, filepath.Join(bogus, "nested"))
+
+ repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ got := map[string]bool{}
+ for _, r := range repos {
+ got[r.RelPath] = true
+ }
+ if got["bogus"] {
+ t.Error("parent with a non-pointer `.git` file must not be recorded as a repo")
+ }
+ if !got["bogus/nested"] {
+ t.Errorf("real repo nested under a bogus .git file should still be discovered; got %+v", repos)
+ }
+}
+
+// Regression companion: a valid `gitdir: …` pointer file (the
+// format git writes for linked worktrees and submodules) must still
+// be accepted as a repo.
+func TestDiscover_AcceptsGitdirPointerFile(t *testing.T) {
+ root := t.TempDir()
+ realGit := filepath.Join(t.TempDir(), "real-git-dir")
+ if err := os.MkdirAll(realGit, 0o755); err != nil {
+ t.Fatal(err)
+ }
+
+ wt := filepath.Join(root, "worktree")
+ if err := os.MkdirAll(wt, 0o755); err != nil {
+ t.Fatal(err)
+ }
+ // Pointer file: git's actual format is literally "gitdir: \n".
+ pointer := []byte("gitdir: " + realGit + "\n")
+ if err := os.WriteFile(filepath.Join(wt, ".git"), pointer, 0o644); err != nil {
+ t.Fatal(err)
+ }
+
+ repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(repos) != 1 || repos[0].RelPath != "worktree" {
+ t.Fatalf("expected to discover the gitdir-pointer worktree, got %+v", repos)
+ }
+}
+
+func TestDiscover_RejectsSymlinkGit(t *testing.T) {
+ root := t.TempDir()
+ // A "repo" whose .git is a symlink — should not be picked up.
+ bad := filepath.Join(root, "weird")
+ if err := os.MkdirAll(bad, 0o755); err != nil {
+ t.Fatal(err)
+ }
+ if err := os.Symlink("/etc/hostname", filepath.Join(bad, ".git")); err != nil {
+ t.Skipf("symlink unsupported here: %v", err)
+ }
+ mustMkRepo(t, filepath.Join(root, "real"))
+
+ repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(repos) != 1 || repos[0].Slug != "real" {
+ t.Fatalf("expected only `real`, got %+v", repos)
+ }
+}
+
+func TestDiscover_MaxDepthHonored(t *testing.T) {
+ root := t.TempDir()
+ mustMkRepo(t, filepath.Join(root, "shallow"))
+ mustMkRepo(t, filepath.Join(root, "a", "b", "c", "deep"))
+
+ repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 2)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(repos) != 1 || repos[0].Slug != "shallow" {
+ t.Fatalf("expected only shallow, got %+v", repos)
+ }
+}
+
+func mustMkRepo(t *testing.T, path string) {
+ t.Helper()
+ if err := os.MkdirAll(filepath.Join(path, ".git"), 0o755); err != nil {
+ t.Fatal(err)
+ }
+}
+
+func TestDiscover_FindsBareRepo(t *testing.T) {
+ root := t.TempDir()
+ bare := filepath.Join(root, "myrepo.git")
+ for _, name := range []string{"HEAD", "objects", "refs"} {
+ full := filepath.Join(bare, name)
+ if name == "HEAD" {
+ if err := os.MkdirAll(bare, 0o755); err != nil {
+ t.Fatal(err)
+ }
+ if err := os.WriteFile(full, []byte("ref: refs/heads/main\n"), 0o644); err != nil {
+ t.Fatal(err)
+ }
+ } else {
+ if err := os.MkdirAll(full, 0o755); err != nil {
+ t.Fatal(err)
+ }
+ }
+ }
+ // Decoy: dir with HEAD only — must not be picked up.
+ decoy := filepath.Join(root, "not-a-repo")
+ if err := os.MkdirAll(decoy, 0o755); err != nil {
+ t.Fatal(err)
+ }
+ if err := os.WriteFile(filepath.Join(decoy, "HEAD"), []byte("nope\n"), 0o644); err != nil {
+ t.Fatal(err)
+ }
+
+ repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(repos) != 1 || repos[0].Slug != "myrepo.git" {
+ t.Fatalf("expected single myrepo.git, got %+v", repos)
+ }
+}
diff --git a/internal/scan/ignore.go b/internal/scan/ignore.go
new file mode 100644
index 0000000..bffce2c
--- /dev/null
+++ b/internal/scan/ignore.go
@@ -0,0 +1,231 @@
+package scan
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "path"
+ "path/filepath"
+ "strings"
+)
+
// IgnoreRule is one pre-parsed line of the ignore file.
// Supported syntax (a subset of gitignore, enough for directory pruning):
//
//	# comment (ignored)
//	node_modules (literal name — matches as basename or full path)
//	vendor/ (directory-only; matches dirs named vendor)
//	archive/* (glob — forwarded to path.Match on the rel path)
//	**/generated (deep-match: same as `generated` matched anywhere)
//	!important (negation — re-includes a match)
//
// Rules are evaluated in order with last-match-wins semantics, following
// gitignore, so `!foo` after `foo/` re-includes foo.
type IgnoreRule struct {
	Pattern string // rule text with the `!` prefix and trailing `/` stripped
	Negate  bool   // line began with `!` — re-includes instead of excluding
	DirOnly bool   // line ended with `/` — fires on directories only
}

// Matcher holds an ordered rule list and answers ignore queries for
// relative paths.
type Matcher struct {
	rules []IgnoreRule
}

// NewMatcher parses one ignore-file line per element of patterns.
// Comments, blank lines, and rules that reduce to an empty pattern
// are discarded.
func NewMatcher(patterns []string) *Matcher {
	var rules []IgnoreRule
	for _, line := range patterns {
		rule, ok := parseRule(line)
		if !ok {
			continue
		}
		rules = append(rules, rule)
	}
	return &Matcher{rules: rules}
}

// LoadMatcher reads an ignore file and returns a matcher. Missing files
// are rejected — this entry point is used when the path was specified
// explicitly (e.g. `--ignore-file foo.ignore`), where a typo should
// fail loudly instead of silently disabling all ignore rules and
// widening discovery scope. Callers that want "load if present, empty
// matcher otherwise" semantics (implicit default-path lookup) should
// os.Stat first.
func LoadMatcher(file string) (*Matcher, error) {
	f, err := os.Open(file)
	if err != nil {
		return nil, fmt.Errorf("open %s: %w", file, err)
	}
	defer f.Close()

	var lines []string
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		lines = append(lines, sc.Text())
	}
	if scanErr := sc.Err(); scanErr != nil {
		return nil, fmt.Errorf("read %s: %w", file, scanErr)
	}
	return NewMatcher(lines), nil
}

// parseRule turns a single ignore-file line into an IgnoreRule. The
// second return is false for lines carrying no rule: comments, blank
// lines, and lines that are empty once the markers are stripped.
func parseRule(line string) (IgnoreRule, bool) {
	trimmed := strings.TrimSpace(line)
	if trimmed == "" || strings.HasPrefix(trimmed, "#") {
		return IgnoreRule{}, false
	}
	var rule IgnoreRule
	if rule.Negate = strings.HasPrefix(trimmed, "!"); rule.Negate {
		trimmed = trimmed[1:]
	}
	if rule.DirOnly = strings.HasSuffix(trimmed, "/"); rule.DirOnly {
		trimmed = strings.TrimSuffix(trimmed, "/")
	}
	rule.Pattern = trimmed
	return rule, rule.Pattern != ""
}
+
+// Match reports whether relPath should be ignored. isDir hints directory-
+// only rules (`vendor/` only fires on dirs).
+//
+// Evaluation order: scan every rule, track the last match. This is what
+// gitignore does — it lets `!src/keep` override a broader earlier `src/`
+// block without forcing the user to care about rule ordering beyond the
+// obvious "put exceptions after the thing they exclude".
+func (m *Matcher) Match(relPath string, isDir bool) bool {
+ relPath = filepath.ToSlash(relPath)
+ matched := false
+ for _, r := range m.rules {
+ if r.DirOnly && !isDir {
+ continue
+ }
+ if !matchRule(r, relPath) {
+ continue
+ }
+ matched = !r.Negate
+ }
+ return matched
+}
+
+// CouldReinclude reports whether any negation rule targets a descendant
+// of dir — i.e. walking into the dir could still yield a re-included
+// path. Callers use this to decide whether to prune an ignored
+// directory via filepath.SkipDir or descend into it and evaluate
+// children individually.
+//
+// Without this check the walker short-circuits the matcher's last-
+// match-wins semantics: a `vendor/` + `!vendor/keep` pair would skip
+// the vendor subtree entirely before vendor/keep could be examined,
+// silently dropping the re-included path.
+//
+// Returns true when:
+// - a negation's pattern has no path separator (basename rules like
+// `!keep` or `!*.go` that could fire at any depth), OR
+// - a negation begins with `**/` (deep-match; applies anywhere in the
+// tree), OR
+// - the pattern's leading path segments are each compatible with the
+// corresponding segment of dir via path.Match, so its trailing
+// segment(s) could name a descendant. This covers literal prefixes
+// like `!vendor/keep` AND globbed prefixes like `!vendor*/keep` or
+// `!*/keep` — both can match children of an ignored `vendor`.
+//
+// Negations that target a sibling or ancestor path (e.g. `!src/keep`
+// when walking vendor) correctly don't trigger descent, so pruning
+// remains effective for unrelated ignored trees like `node_modules/`.
+func (m *Matcher) CouldReinclude(dir string) bool {
+ dir = filepath.ToSlash(dir)
+ dirSegs := strings.Split(dir, "/")
+ for _, r := range m.rules {
+ if !r.Negate {
+ continue
+ }
+ pat := r.Pattern
+ // `**/foo` and other deep-match patterns can fire at any
+ // depth — walking into any ignored subtree could reach one.
+ if strings.HasPrefix(pat, "**/") {
+ return true
+ }
+ // Basename-only rules (no `/`) apply to any segment. `!keep`
+ // could re-include vendor/keep, a/b/keep, anywhere.
+ if !strings.Contains(pat, "/") {
+ return true
+ }
+ patSegs := strings.Split(pat, "/")
+ // The pattern must have strictly more segments than dir;
+ // otherwise its deepest named entity is dir itself or an
+ // ancestor, never a descendant worth descending for.
+ if len(patSegs) <= len(dirSegs) {
+ continue
+ }
+ // Each of the pattern's leading segments must be compatible
+ // with the corresponding dir segment. path.Match handles both
+ // literal names (`vendor`) and globs (`vendor*`, `*`, `?`)
+ // uniformly; a failing match on any segment rules this
+ // negation out.
+ ok := true
+ for i := 0; i < len(dirSegs); i++ {
+ matched, err := path.Match(patSegs[i], dirSegs[i])
+ if err != nil || !matched {
+ ok = false
+ break
+ }
+ }
+ if ok {
+ return true
+ }
+ }
+ return false
+}
+
+func matchRule(r IgnoreRule, relPath string) bool {
+ pat := r.Pattern
+
+ // **/foo → strip leading **/ and treat as "match foo anywhere" —
+ // same as the basename-or-suffix check below. We don't support
+ // arbitrary ** in the middle of patterns because users coming from
+ // gitignore expect the common forms (prefix dir, basename, ext),
+ // not the full double-star algebra. Saves us writing a mini doublestar
+ // engine for a marginal feature.
+ if strings.HasPrefix(pat, "**/") {
+ pat = strings.TrimPrefix(pat, "**/")
+ }
+
+ base := path.Base(relPath)
+
+ // Literal basename: `node_modules` matches any segment named that.
+ if pat == base {
+ return true
+ }
+ // Any segment of the path equals the pattern (literal dir/file name
+ // embedded in the tree). "vendor" matches "a/b/vendor/c.go".
+ if !strings.ContainsAny(pat, "*?[") && segmentMatch(pat, relPath) {
+ return true
+ }
+ // Glob on basename: "*.log"
+ if matched, _ := path.Match(pat, base); matched {
+ return true
+ }
+ // Glob on full relative path: "archive/*"
+ if matched, _ := path.Match(pat, relPath); matched {
+ return true
+ }
+ // Directory prefix: "archive/" (after DirOnly stripping applied above
+ // removed the trailing slash) or "archive/*".
+ prefix := strings.TrimSuffix(pat, "*")
+ prefix = strings.TrimSuffix(prefix, "/")
+ if prefix != "" && prefix != pat {
+ if strings.HasPrefix(relPath, prefix+"/") || relPath == prefix {
+ return true
+ }
+ }
+ return false
+}
+
// segmentMatch reports whether any `/`-separated segment of relPath
// equals needle exactly (substrings don't count).
func segmentMatch(needle, relPath string) bool {
	rest := relPath
	for {
		seg, tail, more := strings.Cut(rest, "/")
		if seg == needle {
			return true
		}
		if !more {
			return false
		}
		rest = tail
	}
}
+
diff --git a/internal/scan/ignore_test.go b/internal/scan/ignore_test.go
new file mode 100644
index 0000000..cda4f25
--- /dev/null
+++ b/internal/scan/ignore_test.go
@@ -0,0 +1,87 @@
+package scan
+
+import (
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+)
+
+func TestMatcher_Basics(t *testing.T) { // one rule of each supported shape, exercised against representative paths
+	m := NewMatcher([]string{
+		"# comment line", // ignored: comment
+		"",               // ignored: blank line
+		"node_modules",   // literal basename / segment
+		"vendor/",        // dir-only rule
+		"archive/*",      // path glob
+		"*.log",          // basename glob
+		"!important.log", // negation: re-includes a previously matched file
+		"**/generated",   // leading **/ form
+	})
+
+	cases := []struct {
+		name  string
+		path  string
+		isDir bool
+		want  bool
+	}{
+		{"basename dir match", "a/b/node_modules", true, true},
+		{"basename file match in middle", "a/node_modules/c.go", false, true},
+		{"vendor dir-only match", "src/vendor", true, true},
+		{"vendor on file does not match", "src/vendor", false, false},
+		{"glob in subdir", "archive/2024", true, true},
+		{"glob extension", "out/build.log", false, true},
+		{"negation re-includes", "out/important.log", false, false},
+		{"deep generated", "src/foo/generated", true, true},
+		{"unrelated path", "src/main.go", false, false},
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			got := m.Match(c.path, c.isDir) // paths are repo-relative, slash-separated
+			if got != c.want {
+				t.Errorf("Match(%q, dir=%v) = %v, want %v", c.path, c.isDir, got, c.want)
+			}
+		})
+	}
+}
+
+// LoadMatcher is called when the user supplied an explicit
+// --ignore-file; a missing file there is almost always a typo. If
+// we silently returned an empty matcher, every discovery would
+// widen to include node_modules/, vendor/, chromium-clones/, etc.
+// the user thought they had excluded — and they'd have no way to
+// tell from the console output. Fail loudly instead; the
+// default-path lookup in scan.loadMatcher handles the "silent when
+// absent" case via its own os.Stat before calling here.
+func TestLoadMatcher_MissingFileFails(t *testing.T) {
+	_, err := LoadMatcher(filepath.Join(t.TempDir(), "typo.ignore")) // t.TempDir is fresh, so the file cannot exist
+	if err == nil {
+		t.Fatal("expected error for missing explicit ignore file; a typo must not silently disable rules")
+	}
+	if !os.IsNotExist(err) && !strings.Contains(err.Error(), "typo.ignore") { // lenient: passes on a raw not-exist error OR a wrapped message naming the path
+		t.Errorf("error should identify the missing path; got %q", err)
+	}
+}
+
+func TestLoadMatcher_FromFile(t *testing.T) { // happy path: rules parsed from disk behave like NewMatcher's
+	dir := t.TempDir()
+	path := filepath.Join(dir, ".gitcortex-ignore")
+	contents := "# comment\nnode_modules\nvendor/\n" // comment line must be skipped; vendor/ is dir-only
+	if err := os.WriteFile(path, []byte(contents), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	m, err := LoadMatcher(path)
+	if err != nil {
+		t.Fatalf("LoadMatcher: %v", err)
+	}
+	if !m.Match("foo/node_modules", true) { // literal rule hits a nested segment
+		t.Error("node_modules should match")
+	}
+	if !m.Match("vendor", true) {
+		t.Error("vendor/ should match dirs")
+	}
+	if m.Match("vendor", false) { // dir-only rule must not catch a plain file
+		t.Error("vendor/ should not match files")
+	}
+}
diff --git a/internal/scan/scan.go b/internal/scan/scan.go
new file mode 100644
index 0000000..a634f21
--- /dev/null
+++ b/internal/scan/scan.go
@@ -0,0 +1,304 @@
+package scan
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "log"
+ "os"
+ "path/filepath"
+ "sync"
+ "time"
+
+ "github.com/lex0c/gitcortex/internal/extract"
+ "github.com/lex0c/gitcortex/internal/git"
+)
+
+// Config holds scan command input. The Extract template is copied per-repo
+// and its Repo/Output/StateFile/Branch are overwritten per worker — the
+// template exists to carry shared flags (ignore patterns, batch size,
+// mailmap, etc.) without re-declaring them on the scan surface.
+type Config struct {
+	Roots      []string       // root directories to search for repositories — presumably walked up to MaxDepth; confirm in the walker
+	Output     string         // scan-level output destination (distinct from the per-repo Extract.Output overwritten per worker)
+	IgnoreFile string         // explicit ignore-file path; a missing file is an error (see LoadMatcher), unlike the silent default lookup
+	MaxDepth   int            // discovery depth limit — assumed to be directory levels below each root; TODO confirm units
+	Parallel   int            // number of concurrent workers, each running one repo extraction (per the template note above)
+	Extract    extract.Config // shared per-repo extract template; Repo/Output/StateFile/Branch are replaced per worker
+}
+
+// Manifest is persisted to