From 369e78b51ed08fcb1ce02792925c9261c08db63a Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 11:49:17 -0300 Subject: [PATCH 01/37] Add scan command for multi-repo discovery and consolidated reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Walks one or more roots, finds every git repo, runs extract per repo in parallel, and emits a manifest plus optional consolidated HTML report. The report grows a per-repository breakdown surfacing how work distributes across repos — both in the team report and in the --email developer profile, where Files/commits/churn are filtered to that dev's contributions. - internal/scan: discovery, gitignore-style ignore matcher, worker pool. Two-pass slug naming makes .state stable across runs so resume keeps working when sibling repos share basenames. - internal/stats: commitEntry carries the LoadMultiJSONL pathPrefix; RepoBreakdown groups commits + files by repo with optional email filter (per-dev file count uses devCommits, not the repo total). - internal/report: Per-Repository Breakdown section in both the main and profile templates, gated on len(Repos) > 1. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 162 +++++++++++++++ internal/report/profile_template.go | 36 ++++ internal/report/report.go | 15 ++ internal/report/template.go | 39 ++++ internal/scan/discovery.go | 175 +++++++++++++++++ internal/scan/discovery_test.go | 166 ++++++++++++++++ internal/scan/ignore.go | 160 +++++++++++++++ internal/scan/ignore_test.go | 81 ++++++++ internal/scan/scan.go | 271 ++++++++++++++++++++++++++ internal/scan/scan_test.go | 138 +++++++++++++ internal/stats/reader.go | 6 + internal/stats/repo_breakdown.go | 169 ++++++++++++++++ internal/stats/repo_breakdown_test.go | 127 ++++++++++++ 13 files changed, 1545 insertions(+) create mode 100644 internal/scan/discovery.go create mode 100644 internal/scan/discovery_test.go create mode 100644 internal/scan/ignore.go create mode 100644 internal/scan/ignore_test.go create mode 100644 internal/scan/scan.go create mode 100644 internal/scan/scan_test.go create mode 100644 internal/stats/repo_breakdown.go create mode 100644 internal/stats/repo_breakdown_test.go diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 50267c4..ddcf0eb 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -16,6 +16,7 @@ import ( "github.com/lex0c/gitcortex/internal/extract" "github.com/lex0c/gitcortex/internal/git" reportpkg "github.com/lex0c/gitcortex/internal/report" + "github.com/lex0c/gitcortex/internal/scan" "github.com/lex0c/gitcortex/internal/stats" "github.com/spf13/cobra" @@ -35,6 +36,7 @@ func main() { rootCmd.AddCommand(diffCmd()) rootCmd.AddCommand(ciCmd()) rootCmd.AddCommand(reportCmd()) + rootCmd.AddCommand(scanCmd()) if err := rootCmd.Execute(); err != nil { os.Exit(1) @@ -869,3 +871,163 @@ func reportCmd() *cobra.Command { return cmd } + +// --- Scan --- + +func scanCmd() *cobra.Command { + var ( + roots []string + output string + ignoreFile string + maxDepth int + parallel int + email string + from string + to string + since string + reportPath string 
+ topN int + extractIgnore []string + batchSize int + mailmap bool + firstParent bool + includeMessages bool + couplingMaxFiles int + couplingMinChanges int + churnHalfLife int + networkMinFiles int + ) + + cmd := &cobra.Command{ + Use: "scan", + Short: "Discover git repositories under one or more roots and consolidate their history", + Long: `Walk the given root(s), find every git repository, and run extract on each +repository in parallel. Outputs one JSONL per repo plus a manifest in --output. +Optionally generates a consolidated HTML report including a per-repository +breakdown — handy for showing aggregated work across many repos.`, + RunE: func(cmd *cobra.Command, args []string) error { + if len(roots) == 0 { + return fmt.Errorf("--root is required (repeatable for multiple roots)") + } + if since != "" && (from != "" || to != "") { + return fmt.Errorf("--since cannot be combined with --from/--to") + } + if err := validateDate(from, "--from"); err != nil { + return err + } + if err := validateDate(to, "--to"); err != nil { + return err + } + if from != "" && to != "" && from > to { + return fmt.Errorf("--from (%s) must be on or before --to (%s)", from, to) + } + + cfg := scan.Config{ + Roots: roots, + Output: output, + IgnoreFile: ignoreFile, + MaxDepth: maxDepth, + Parallel: parallel, + Extract: extract.Config{ + BatchSize: batchSize, + IncludeMessages: includeMessages, + CommandTimeout: extract.DefaultCommandTimeout, + FirstParent: firstParent, + Mailmap: mailmap, + IgnorePatterns: extractIgnore, + StartOffset: -1, + }, + } + + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + + result, err := scan.Run(ctx, cfg) + // scan.Run returns a partial result alongside ctx.Err() on + // cancellation. Honor that — write whatever progress we made + // to disk and surface the error so the CLI exits non-zero. 
+ if err != nil { + return err + } + + if reportPath == "" { + fmt.Fprintf(os.Stderr, "Scan complete: %d JSONL file(s) in %s\n", len(result.JSONLs), result.OutputDir) + return nil + } + if len(result.JSONLs) == 0 { + return fmt.Errorf("no successful repos extracted; cannot build report") + } + + fromDate := from + if since != "" { + d, err := parseSince(since) + if err != nil { + return err + } + fromDate = d + } + + ds, err := stats.LoadMultiJSONL(result.JSONLs, stats.LoadOptions{ + From: fromDate, + To: to, + HalfLifeDays: churnHalfLife, + CoupMaxFiles: couplingMaxFiles, + }) + if err != nil { + return fmt.Errorf("load consolidated dataset: %w", err) + } + fmt.Fprintf(os.Stderr, "Loaded %d commits across %d repo(s)\n", ds.CommitCount, len(result.JSONLs)) + + f, err := os.Create(reportPath) + if err != nil { + return fmt.Errorf("create report: %w", err) + } + defer f.Close() + + // Label the report after the basename of the first --root + // (or the output dir as a fallback). "scan-scan-output" was + // the previous default; users find the root name far more + // recognizable as the H1 of the report. 
+ repoLabel := filepath.Base(result.OutputDir) + if len(cfg.Roots) > 0 { + repoLabel = filepath.Base(absPath(cfg.Roots[0])) + } + sf := stats.StatsFlags{CouplingMinChanges: couplingMinChanges, NetworkMinFiles: networkMinFiles} + if email != "" { + if err := reportpkg.GenerateProfile(f, ds, repoLabel, email); err != nil { + return fmt.Errorf("generate profile report: %w", err) + } + fmt.Fprintf(os.Stderr, "Profile report for %s written to %s\n", email, fileURL(reportPath)) + return nil + } + if err := reportpkg.Generate(f, ds, repoLabel, topN, sf); err != nil { + return fmt.Errorf("generate report: %w", err) + } + fmt.Fprintf(os.Stderr, "Consolidated report written to %s\n", fileURL(reportPath)) + return nil + }, + } + + cmd.Flags().StringSliceVar(&roots, "root", nil, "Root directory to walk for repositories (repeatable)") + cmd.Flags().StringVar(&output, "output", "scan-output", "Directory to write per-repo JSONL files and the manifest") + cmd.Flags().StringVar(&ignoreFile, "ignore-file", "", "Gitignore-style file with directories to skip during discovery. When unset, only the first --root is searched for a .gitcortex-ignore; pass an explicit path to apply rules across all roots.") + cmd.Flags().IntVar(&maxDepth, "max-depth", 0, "Maximum directory depth to descend into when looking for repos (0 = unlimited)") + cmd.Flags().IntVar(¶llel, "parallel", 4, "Number of repositories to extract in parallel") + cmd.Flags().StringVar(&email, "email", "", "Generate a per-developer profile report (only when --report is set)") + cmd.Flags().StringVar(&from, "from", "", "Window start date YYYY-MM-DD (forwarded to the consolidated report)") + cmd.Flags().StringVar(&to, "to", "", "Window end date YYYY-MM-DD (forwarded to the consolidated report)") + cmd.Flags().StringVar(&since, "since", "", "Filter to recent period (e.g. 
7d, 4w, 3m, 1y); mutually exclusive with --from/--to") + cmd.Flags().StringVar(&reportPath, "report", "", "If set, generate a consolidated HTML report at this path after the scan") + cmd.Flags().IntVar(&topN, "top", 20, "Top-N entries per section in the consolidated report") + cmd.Flags().StringSliceVar(&extractIgnore, "extract-ignore", nil, "Glob patterns forwarded to per-repo extract --ignore (e.g. package-lock.json)") + cmd.Flags().IntVar(&batchSize, "batch-size", 1000, "Per-repo extract checkpoint interval") + cmd.Flags().BoolVar(&mailmap, "mailmap", false, "Use .mailmap (per repo) to normalize identities") + cmd.Flags().BoolVar(&firstParent, "first-parent", false, "Restrict extracts to the first-parent chain") + cmd.Flags().BoolVar(&includeMessages, "include-commit-messages", false, "Include commit messages in JSONL (needed for Top Commits in the consolidated report)") + cmd.Flags().IntVar(&couplingMaxFiles, "coupling-max-files", 50, "Max files per commit for coupling analysis (consolidated report)") + cmd.Flags().IntVar(&couplingMinChanges, "coupling-min-changes", 5, "Min co-changes for coupling results (consolidated report)") + cmd.Flags().IntVar(&churnHalfLife, "churn-half-life", 90, "Half-life in days for churn decay (consolidated report)") + cmd.Flags().IntVar(&networkMinFiles, "network-min-files", 5, "Min shared files for dev-network edges (consolidated report)") + + return cmd +} diff --git a/internal/report/profile_template.go b/internal/report/profile_template.go index 5bb8f0f..5fedc3a 100644 --- a/internal/report/profile_template.go +++ b/internal/report/profile_template.go @@ -134,6 +134,42 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{end}} +{{if gt (len .Repos) 1}} +

Per-Repository Breakdown {{thousands (len .Repos)}} repositories

+

How {{.Profile.Name}}'s work is split across repositories in this scan. Use this to point at the projects with the most activity, or to spot single-repo focus vs. broad multi-repo engagement.

+ + + + + + + + + + + +{{range .Repos}} + + + + + + + + + + +{{end}} +
RepositoryCommits% of My CommitsChurn% of My ChurnFilesActive daysFirst → Last
{{.Repo}}{{thousands .Commits}} +
+
+
+
+ {{printf "%.1f" .PctOfTotalCommits}}% +
+
{{thousands .Churn}}{{printf "%.1f" .PctOfTotalChurn}}%{{thousands .Files}}{{.ActiveDays}}{{.FirstCommitDate}} → {{.LastCommitDate}}
+{{end}} + {{if .Profile.TopFiles}}

Top Files

Files this developer changed most (churn = additions + deletions). High churn on few files suggests deep ownership and potential knowledge concentration. · {{docRef "hotspots"}}

diff --git a/internal/report/report.go b/internal/report/report.go index 80e875f..358c894 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -68,6 +68,12 @@ type ReportData struct { TotalDirectories int TotalExtensions int TotalBusFactorFiles int + + // Repos holds per-repository aggregates for multi-repo (scan) reports. + // Empty on single-repo runs — the template gates the section behind + // `{{if gt (len .Repos) 1}}` so single-repo callers keep their + // existing layout untouched. + Repos []stats.RepoStat } // htmlTreeDepth caps the repo-structure tree baked into the HTML report. @@ -376,6 +382,7 @@ func Generate(w io.Writer, ds *stats.Dataset, repoName string, topN int, sf stat TotalDirectories: stats.DirectoryCount(ds), TotalExtensions: stats.ExtensionCount(ds), TotalBusFactorFiles: stats.BusFactorCount(ds), + Repos: stats.RepoBreakdown(ds, ""), } CapChildrenPerDir(data.Structure, htmlTreeMaxChildrenPerDir) @@ -641,6 +648,13 @@ type ProfileReportData struct { MaxActivityCommits int PatternGrid [7][24]int MaxPattern int + + // Repos is the per-repository breakdown filtered to this developer's + // commits. Empty on single-repo profile reports — gated in the + // template so existing single-repo callers see no change. The headline + // use case for `gitcortex scan --email me` lives here: the developer + // can see at a glance which repos they spent time in. 
+ Repos []stats.RepoStat } func GenerateProfile(w io.Writer, ds *stats.Dataset, repoName, email string) error { @@ -672,6 +686,7 @@ func GenerateProfile(w io.Writer, ds *stats.Dataset, repoName, email string) err MaxActivityCommits: maxAct, PatternGrid: p.WorkGrid, MaxPattern: maxP, + Repos: stats.RepoBreakdown(ds, email), } return profileTmpl.Execute(w, data) diff --git a/internal/report/template.go b/internal/report/template.go index d6a3a64..5abc4e1 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -145,6 +145,45 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col +{{if gt (len .Repos) 1}} +

Per-Repository Breakdown {{thousands (len .Repos)}} repositories

+

Cross-repo aggregation from scan. Use this view to see how work is distributed: a single repo dominating commits or churn often means the consolidated activity story is really about that one project; an even spread shows broad multi-repo engagement. Bar widths are normalized to the largest commit-count repo.

+{{$maxRepoCommits := 0}}{{range .Repos}}{{if gt .Commits $maxRepoCommits}}{{$maxRepoCommits = .Commits}}{{end}}{{end}} + + + + + + + + + + + + +{{range .Repos}} + + + + + + + + + + + +{{end}} +
RepositoryCommits% CommitsChurn% ChurnFilesActive daysDevsFirst → Last
{{.Repo}}{{thousands .Commits}} +
+
+
+
+ {{printf "%.1f" .PctOfTotalCommits}}% +
+
{{thousands .Churn}}{{printf "%.1f" .PctOfTotalChurn}}%{{thousands .Files}}{{.ActiveDays}}{{.UniqueDevs}}{{.FirstCommitDate}} → {{.LastCommitDate}}
+{{end}} + {{if .ActivityYears}}

Activity

Monthly commit heatmap. Darker = more commits. Sudden drop-offs may mark team changes, re-orgs, or freezes; steady cadence signals healthy pace. Hover for details; toggle to table for exact numbers. · {{docRef "activity"}}

diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go new file mode 100644 index 0000000..0d7b66f --- /dev/null +++ b/internal/scan/discovery.go @@ -0,0 +1,175 @@ +package scan + +import ( + "crypto/sha1" + "encoding/hex" + "fmt" + "log" + "os" + "path/filepath" + "sort" + "strings" +) + +// Repo describes one git repository discovered during scan. +type Repo struct { + // AbsPath is the absolute path to the working tree root. + AbsPath string + // RelPath is AbsPath relative to the scan root it was found under. + RelPath string + // Slug is a filesystem-safe identifier used to derive the output + // JSONL file name and the path prefix that stats will use as the + // repo label. Must be unique across the scan (collisions are resolved + // by appending a short hash of AbsPath). + Slug string +} + +// Discover walks the given roots and returns every git repository it finds. +// Directories matched by the ignore matcher are pruned. Each repo's Slug is +// unique across the full result so downstream JSONL files don't collide, +// and the prefix LoadMultiJSONL derives (basename-minus-ext) still groups +// correctly. +func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { + if matcher == nil { + matcher = NewMatcher(nil) + } + var repos []Repo + seen := make(map[string]bool) + + for _, root := range roots { + abs, err := filepath.Abs(root) + if err != nil { + return nil, fmt.Errorf("resolve %s: %w", root, err) + } + info, err := os.Stat(abs) + if err != nil { + return nil, fmt.Errorf("stat %s: %w", abs, err) + } + if !info.IsDir() { + return nil, fmt.Errorf("%s is not a directory", abs) + } + + err = filepath.WalkDir(abs, func(path string, d os.DirEntry, werr error) error { + if werr != nil { + // Permission errors on one subtree shouldn't abort the + // whole scan. Log and keep walking. 
+ log.Printf("scan: skip %s: %v", path, werr) + if d != nil && d.IsDir() { + return filepath.SkipDir + } + return nil + } + if !d.IsDir() { + return nil + } + + rel, _ := filepath.Rel(abs, path) + rel = filepath.ToSlash(rel) + + if maxDepth > 0 && rel != "." { + depth := strings.Count(rel, "/") + 1 + if depth > maxDepth { + return filepath.SkipDir + } + } + + if rel != "." && matcher.Match(rel, true) { + return filepath.SkipDir + } + + // Detect .git either as a directory (normal) or a file + // (worktree / submodule). Presence alone is enough — we don't + // try to validate the content; extract will fail loudly if + // the repo is broken, which is a better signal than silently + // skipping. + // Lstat (not Stat) so a symlink named `.git` pointing somewhere + // arbitrary doesn't get treated as a real repo. Real `.git` + // entries are either a directory (normal worktree) or a regular + // file (worktree/submodule pointer). Both are caught here; a + // symlink is rejected, which is the conservative choice — if + // users genuinely point `.git` at a shared dir via symlink they + // can resolve it before scanning. + gitEntry := filepath.Join(path, ".git") + if info, statErr := os.Lstat(gitEntry); statErr == nil && info.Mode()&os.ModeSymlink == 0 { + if seen[path] { + return filepath.SkipDir + } + seen[path] = true + repos = append(repos, Repo{ + AbsPath: path, + RelPath: rel, + }) + // Don't descend into a repo — nested repos (submodules, + // vendored repos) get picked up separately only if they + // live outside this repo's worktree, which is rare. If + // users need submodule coverage they can list the parent + // and the submodule paths as separate roots. + return filepath.SkipDir + } + + return nil + }) + if err != nil { + return nil, err + } + } + + assignSlugs(repos) + // Sort by slug so output ordering is deterministic across runs + // (WalkDir ordering is, but concatenating multiple roots could shuffle). 
+ sort.Slice(repos, func(i, j int) bool { return repos[i].Slug < repos[j].Slug }) + return repos, nil +} + +// assignSlugs derives a unique slug per repo from its basename, falling +// back to `-` when two repos share a name. +// The slug is also the JSONL filename stem and the persistence key +// (`.state`), so uniqueness AND determinism across runs both matter: +// if a re-run swaps which sibling gets the bare name, the per-repo +// state file is orphaned and the affected repos are re-extracted from +// scratch (or worse, two repos collide onto one state). +// +// Two-pass: count basenames first, then suffix with a hash whenever the +// base appears more than once anywhere in the result. This makes the +// slug a pure function of (absPath, set of basenames seen), independent +// of WalkDir traversal order. +func assignSlugs(repos []Repo) { + counts := make(map[string]int) + bases := make([]string, len(repos)) + for i := range repos { + base := sanitizeSlug(filepath.Base(repos[i].AbsPath)) + if base == "" { + base = "repo" + } + bases[i] = base + counts[base]++ + } + for i := range repos { + base := bases[i] + slug := base + if counts[base] > 1 { + h := sha1.Sum([]byte(repos[i].AbsPath)) + slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:6]) + } + repos[i].Slug = slug + } +} + +// sanitizeSlug strips characters that would break the LoadMultiJSONL prefix +// contract (`:`) or filesystem paths. Keeps alphanumerics, dash, +// underscore, and dot. 
+func sanitizeSlug(s string) string { + var b strings.Builder + for _, r := range s { + switch { + case r >= 'a' && r <= 'z', + r >= 'A' && r <= 'Z', + r >= '0' && r <= '9', + r == '-', r == '_', r == '.': + b.WriteRune(r) + default: + b.WriteRune('_') + } + } + return b.String() +} diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go new file mode 100644 index 0000000..4bb7a0c --- /dev/null +++ b/internal/scan/discovery_test.go @@ -0,0 +1,166 @@ +package scan + +import ( + "os" + "path/filepath" + "testing" +) + +func TestDiscover_FindsRepos(t *testing.T) { + root := t.TempDir() + + mustMkRepo(t, filepath.Join(root, "a")) + mustMkRepo(t, filepath.Join(root, "b")) + mustMkRepo(t, filepath.Join(root, "nested", "c")) + // Plain dir without .git — must NOT be picked up. + if err := os.MkdirAll(filepath.Join(root, "not-a-repo", "src"), 0o755); err != nil { + t.Fatal(err) + } + + repos, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatalf("Discover: %v", err) + } + if len(repos) != 3 { + t.Fatalf("expected 3 repos, got %d: %+v", len(repos), repos) + } + got := map[string]bool{} + for _, r := range repos { + got[r.Slug] = true + } + for _, want := range []string{"a", "b", "c"} { + if !got[want] { + t.Errorf("expected slug %q in %v", want, got) + } + } +} + +func TestDiscover_RespectsIgnore(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "keep")) + mustMkRepo(t, filepath.Join(root, "node_modules", "garbage")) + + matcher := NewMatcher([]string{"node_modules"}) + repos, err := Discover([]string{root}, matcher, 0) + if err != nil { + t.Fatal(err) + } + if len(repos) != 1 || repos[0].Slug != "keep" { + t.Fatalf("expected only `keep`, got %+v", repos) + } +} + +func TestDiscover_DoesNotDescendIntoRepo(t *testing.T) { + root := t.TempDir() + parent := filepath.Join(root, "parent") + mustMkRepo(t, parent) + // A nested .git inside an already-discovered repo is treated as a + // submodule and skipped 
— we don't double-count. + mustMkRepo(t, filepath.Join(parent, "submodule")) + + repos, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + if len(repos) != 1 { + t.Fatalf("expected 1 repo (parent only), got %d: %+v", len(repos), repos) + } +} + +func TestDiscover_SlugCollisionGetsHashSuffix(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "a", "myrepo")) + mustMkRepo(t, filepath.Join(root, "b", "myrepo")) + + repos, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + if len(repos) != 2 { + t.Fatalf("expected 2 repos, got %d", len(repos)) + } + if repos[0].Slug == repos[1].Slug { + t.Errorf("collision not resolved: both slugs are %q", repos[0].Slug) + } + // With the two-pass naming, BOTH duplicates carry a hash suffix — + // neither gets the bare basename. This guarantees the slug for any + // given path is stable regardless of which sibling WalkDir hits + // first across runs. + for _, r := range repos { + if r.Slug == "myrepo" { + t.Errorf("expected both colliding repos to get a hash suffix, but %s kept the bare name", r.AbsPath) + } + } +} + +// Re-running discovery must produce the same slug for the same path +// even when the WalkDir traversal could legally vary. This is the +// invariant `.state` resumption depends on. 
+func TestDiscover_SlugDeterministicAcrossRuns(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "a", "myrepo")) + mustMkRepo(t, filepath.Join(root, "b", "myrepo")) + + first, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + second, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + if len(first) != len(second) { + t.Fatalf("repo count differs across runs: %d vs %d", len(first), len(second)) + } + pathSlug := map[string]string{} + for _, r := range first { + pathSlug[r.AbsPath] = r.Slug + } + for _, r := range second { + if pathSlug[r.AbsPath] != r.Slug { + t.Errorf("slug for %s changed across runs: %q → %q", r.AbsPath, pathSlug[r.AbsPath], r.Slug) + } + } +} + +func TestDiscover_RejectsSymlinkGit(t *testing.T) { + root := t.TempDir() + // A "repo" whose .git is a symlink — should not be picked up. + bad := filepath.Join(root, "weird") + if err := os.MkdirAll(bad, 0o755); err != nil { + t.Fatal(err) + } + if err := os.Symlink("/etc/hostname", filepath.Join(bad, ".git")); err != nil { + t.Skipf("symlink unsupported here: %v", err) + } + mustMkRepo(t, filepath.Join(root, "real")) + + repos, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + if len(repos) != 1 || repos[0].Slug != "real" { + t.Fatalf("expected only `real`, got %+v", repos) + } +} + +func TestDiscover_MaxDepthHonored(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "shallow")) + mustMkRepo(t, filepath.Join(root, "a", "b", "c", "deep")) + + repos, err := Discover([]string{root}, NewMatcher(nil), 2) + if err != nil { + t.Fatal(err) + } + if len(repos) != 1 || repos[0].Slug != "shallow" { + t.Fatalf("expected only shallow, got %+v", repos) + } +} + +func mustMkRepo(t *testing.T, path string) { + t.Helper() + if err := os.MkdirAll(filepath.Join(path, ".git"), 0o755); err != nil { + t.Fatal(err) + } +} diff --git a/internal/scan/ignore.go 
b/internal/scan/ignore.go new file mode 100644 index 0000000..c378b05 --- /dev/null +++ b/internal/scan/ignore.go @@ -0,0 +1,160 @@ +package scan + +import ( + "bufio" + "fmt" + "os" + "path" + "path/filepath" + "strings" +) + +// IgnoreRule is one line of the ignore file, pre-parsed. +// Supported syntax (a subset of gitignore, enough for directory pruning): +// +// # comment (ignored) +// node_modules (literal name — matches as basename or full path) +// vendor/ (directory-only; matches dirs named vendor) +// archive/* (glob — forwarded to path.Match on the rel path) +// **/generated (deep-match: same as `generated` matched anywhere) +// !important (negation — re-includes a match) +// +// Matching is evaluated in order; the last rule to match wins, following +// gitignore semantics so `!foo` after `foo/` re-includes foo. +type IgnoreRule struct { + Pattern string + Negate bool + DirOnly bool +} + +type Matcher struct { + rules []IgnoreRule +} + +func NewMatcher(patterns []string) *Matcher { + m := &Matcher{} + for _, p := range patterns { + if r, ok := parseRule(p); ok { + m.rules = append(m.rules, r) + } + } + return m +} + +// LoadMatcher reads an ignore file and returns a matcher. A missing file +// is not an error — scan simply proceeds with no user-provided rules. +// This matches the gitignore contract where `.gitignore` is optional. 
+func LoadMatcher(file string) (*Matcher, error) { + f, err := os.Open(file) + if err != nil { + if os.IsNotExist(err) { + return NewMatcher(nil), nil + } + return nil, fmt.Errorf("open %s: %w", file, err) + } + defer f.Close() + + var patterns []string + scanner := bufio.NewScanner(f) + for scanner.Scan() { + patterns = append(patterns, scanner.Text()) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("read %s: %w", file, err) + } + return NewMatcher(patterns), nil +} + +func parseRule(line string) (IgnoreRule, bool) { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + return IgnoreRule{}, false + } + r := IgnoreRule{} + if strings.HasPrefix(line, "!") { + r.Negate = true + line = line[1:] + } + if strings.HasSuffix(line, "/") { + r.DirOnly = true + line = strings.TrimSuffix(line, "/") + } + r.Pattern = line + return r, r.Pattern != "" +} + +// Match reports whether relPath should be ignored. isDir hints directory- +// only rules (`vendor/` only fires on dirs). +// +// Evaluation order: scan every rule, track the last match. This is what +// gitignore does — it lets `!src/keep` override a broader earlier `src/` +// block without forcing the user to care about rule ordering beyond the +// obvious "put exceptions after the thing they exclude". +func (m *Matcher) Match(relPath string, isDir bool) bool { + relPath = filepath.ToSlash(relPath) + matched := false + for _, r := range m.rules { + if r.DirOnly && !isDir { + continue + } + if !matchRule(r, relPath) { + continue + } + matched = !r.Negate + } + return matched +} + +func matchRule(r IgnoreRule, relPath string) bool { + pat := r.Pattern + + // **/foo → strip leading **/ and treat as "match foo anywhere" — + // same as the basename-or-suffix check below. We don't support + // arbitrary ** in the middle of patterns because users coming from + // gitignore expect the common forms (prefix dir, basename, ext), + // not the full double-star algebra. 
Saves us writing a mini doublestar + // engine for a marginal feature. + if strings.HasPrefix(pat, "**/") { + pat = strings.TrimPrefix(pat, "**/") + } + + base := path.Base(relPath) + + // Literal basename: `node_modules` matches any segment named that. + if pat == base { + return true + } + // Any segment of the path equals the pattern (literal dir/file name + // embedded in the tree). "vendor" matches "a/b/vendor/c.go". + if !strings.ContainsAny(pat, "*?[") && segmentMatch(pat, relPath) { + return true + } + // Glob on basename: "*.log" + if matched, _ := path.Match(pat, base); matched { + return true + } + // Glob on full relative path: "archive/*" + if matched, _ := path.Match(pat, relPath); matched { + return true + } + // Directory prefix: "archive/" (after DirOnly stripping applied above + // removed the trailing slash) or "archive/*". + prefix := strings.TrimSuffix(pat, "*") + prefix = strings.TrimSuffix(prefix, "/") + if prefix != "" && prefix != pat { + if strings.HasPrefix(relPath, prefix+"/") || relPath == prefix { + return true + } + } + return false +} + +func segmentMatch(needle, relPath string) bool { + for _, seg := range strings.Split(relPath, "/") { + if seg == needle { + return true + } + } + return false +} + diff --git a/internal/scan/ignore_test.go b/internal/scan/ignore_test.go new file mode 100644 index 0000000..ed9510f --- /dev/null +++ b/internal/scan/ignore_test.go @@ -0,0 +1,81 @@ +package scan + +import ( + "os" + "path/filepath" + "testing" +) + +func TestMatcher_Basics(t *testing.T) { + m := NewMatcher([]string{ + "# comment line", + "", + "node_modules", + "vendor/", + "archive/*", + "*.log", + "!important.log", + "**/generated", + }) + + cases := []struct { + name string + path string + isDir bool + want bool + }{ + {"basename dir match", "a/b/node_modules", true, true}, + {"basename file match in middle", "a/node_modules/c.go", false, true}, + {"vendor dir-only match", "src/vendor", true, true}, + {"vendor on file does not match", 
"src/vendor", false, false}, + {"glob in subdir", "archive/2024", true, true}, + {"glob extension", "out/build.log", false, true}, + {"negation re-includes", "out/important.log", false, false}, + {"deep generated", "src/foo/generated", true, true}, + {"unrelated path", "src/main.go", false, false}, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := m.Match(c.path, c.isDir) + if got != c.want { + t.Errorf("Match(%q, dir=%v) = %v, want %v", c.path, c.isDir, got, c.want) + } + }) + } +} + +func TestLoadMatcher_MissingFileIsOK(t *testing.T) { + m, err := LoadMatcher(filepath.Join(t.TempDir(), "missing")) + if err != nil { + t.Fatalf("expected no error for missing file, got %v", err) + } + if m == nil { + t.Fatal("matcher should not be nil") + } + if m.Match("anything", false) { + t.Error("empty matcher should match nothing") + } +} + +func TestLoadMatcher_FromFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, ".gitcortex-ignore") + contents := "# comment\nnode_modules\nvendor/\n" + if err := os.WriteFile(path, []byte(contents), 0o644); err != nil { + t.Fatal(err) + } + m, err := LoadMatcher(path) + if err != nil { + t.Fatalf("LoadMatcher: %v", err) + } + if !m.Match("foo/node_modules", true) { + t.Error("node_modules should match") + } + if !m.Match("vendor", true) { + t.Error("vendor/ should match dirs") + } + if m.Match("vendor", false) { + t.Error("vendor/ should not match files") + } +} diff --git a/internal/scan/scan.go b/internal/scan/scan.go new file mode 100644 index 0000000..0d4a41d --- /dev/null +++ b/internal/scan/scan.go @@ -0,0 +1,271 @@ +package scan + +import ( + "context" + "encoding/json" + "fmt" + "log" + "os" + "path/filepath" + "sync" + "time" + + "github.com/lex0c/gitcortex/internal/extract" + "github.com/lex0c/gitcortex/internal/git" +) + +// Config holds scan command input. 
The Extract template is copied per-repo +// and its Repo/Output/StateFile/Branch are overwritten per worker — the +// template exists to carry shared flags (ignore patterns, batch size, +// mailmap, etc.) without re-declaring them on the scan surface. +type Config struct { + Roots []string + Output string + IgnoreFile string + MaxDepth int + Parallel int + Extract extract.Config +} + +// Manifest is persisted to /manifest.json so subsequent runs +// (or the report stage) can discover which JSONL files to consolidate +// without re-walking the filesystem. +type Manifest struct { + GeneratedAt string `json:"generated_at"` + Roots []string `json:"roots"` + Repos []ManifestRepo `json:"repos"` +} + +type ManifestRepo struct { + Slug string `json:"slug"` + Path string `json:"path"` + Branch string `json:"branch"` + JSONL string `json:"jsonl"` + StateFile string `json:"state_file"` + Status string `json:"status"` // ok | failed | skipped + Error string `json:"error,omitempty"` + Commits int `json:"commits,omitempty"` + DurationMS int64 `json:"duration_ms,omitempty"` +} + +// Result is returned to the caller so the CLI can drive the report stage +// with the concrete JSONL paths (no need to re-read the manifest). +type Result struct { + OutputDir string + Manifest Manifest + JSONLs []string // only successful repos +} + +// Run discovers repos under cfg.Roots, writes one JSONL per repo via +// the existing extract pipeline, and emits a manifest. Failures on +// individual repos are recorded in the manifest with Status="failed" +// but do not abort the whole scan — the report can still be generated +// from the repos that succeeded. 
+func Run(ctx context.Context, cfg Config) (*Result, error) { + if len(cfg.Roots) == 0 { + return nil, fmt.Errorf("scan: at least one --root is required") + } + if cfg.Output == "" { + return nil, fmt.Errorf("scan: --output is required") + } + if err := os.MkdirAll(cfg.Output, 0o755); err != nil { + return nil, fmt.Errorf("create output dir: %w", err) + } + if cfg.Parallel <= 0 { + cfg.Parallel = 1 + } + + matcher, err := loadMatcher(cfg) + if err != nil { + return nil, err + } + + repos, err := Discover(cfg.Roots, matcher, cfg.MaxDepth) + if err != nil { + return nil, err + } + if len(repos) == 0 { + return nil, fmt.Errorf("scan: no git repositories found under %v", cfg.Roots) + } + log.Printf("scan: discovered %d repositories", len(repos)) + + manifest := Manifest{ + GeneratedAt: time.Now().UTC().Format(time.RFC3339), + Roots: cfg.Roots, + Repos: make([]ManifestRepo, len(repos)), + } + // Pre-fill every slot with the repo's identity and a "pending" + // status. If the user cancels (Ctrl+C) before a worker reaches a + // repo, the manifest still names it with Status="pending" instead + // of an empty struct that misleads "ok, 0 failed" accounting. + for i, r := range repos { + manifest.Repos[i] = ManifestRepo{ + Slug: r.Slug, + Path: r.AbsPath, + JSONL: filepath.Join(cfg.Output, r.Slug+".jsonl"), + StateFile: filepath.Join(cfg.Output, r.Slug+".state"), + Status: "pending", + } + } + + // Worker pool over repos. Each worker runs extract.Run against one + // repo and writes to its dedicated JSONL + state file. The extract + // package already checkpoints per-repo state, so an interrupted scan + // can be resumed by re-invoking the command — completed repos will + // be skipped (their state file's last SHA prevents re-emission) and + // partials will resume. 
+ type job struct { + idx int + repo Repo + } + jobs := make(chan job) + var wg sync.WaitGroup + + for w := 0; w < cfg.Parallel; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := range jobs { + entry := runOne(ctx, cfg, j.repo) + manifest.Repos[j.idx] = entry + } + }() + } + + go func() { + for i, r := range repos { + select { + case <-ctx.Done(): + close(jobs) + return + case jobs <- job{idx: i, repo: r}: + } + } + close(jobs) + }() + + wg.Wait() + + manifestPath := filepath.Join(cfg.Output, "manifest.json") + if err := writeManifest(manifestPath, manifest); err != nil { + return nil, fmt.Errorf("write manifest: %w", err) + } + + var jsonls []string + var ok, failed, pending int + for _, r := range manifest.Repos { + switch r.Status { + case "ok": + jsonls = append(jsonls, r.JSONL) + ok++ + case "pending": + pending++ + default: + failed++ + } + } + log.Printf("scan: %d ok, %d failed, %d not started; manifest at %s", ok, failed, pending, manifestPath) + + res := &Result{ + OutputDir: cfg.Output, + Manifest: manifest, + JSONLs: jsonls, + } + // Surface cancellation as a non-nil error so the CLI exits non-zero + // — but return the partial result anyway so the caller can still + // inspect what completed before the interrupt. + if err := ctx.Err(); err != nil { + return res, err + } + return res, nil +} + +func loadMatcher(cfg Config) (*Matcher, error) { + // Per-root default: look for .gitcortex-ignore at the first root when + // no explicit path was given. Single-file load is enough for the MVP; + // if users need per-root rule sets they can concat them into one file. 
+ path := cfg.IgnoreFile + if path == "" && len(cfg.Roots) > 0 { + candidate := filepath.Join(cfg.Roots[0], ".gitcortex-ignore") + if _, err := os.Stat(candidate); err == nil { + path = candidate + } + } + if path == "" { + return NewMatcher(nil), nil + } + return LoadMatcher(path) +} + +func runOne(ctx context.Context, cfg Config, repo Repo) ManifestRepo { + jsonlPath := filepath.Join(cfg.Output, repo.Slug+".jsonl") + statePath := filepath.Join(cfg.Output, repo.Slug+".state") + entry := ManifestRepo{ + Slug: repo.Slug, + Path: repo.AbsPath, + JSONL: jsonlPath, + StateFile: statePath, + } + + // Skip cleanly if the user already cancelled before this worker + // picked up the job — avoids a noisy "extracting..." log followed + // by an immediate ctx.Err() from extract. + if err := ctx.Err(); err != nil { + entry.Status = "skipped" + entry.Error = err.Error() + return entry + } + + branch := cfg.Extract.Branch + if branch == "" { + branch = git.DetectDefaultBranch(repo.AbsPath) + } + if branch == "" { + entry.Status = "failed" + entry.Error = "could not detect default branch" + return entry + } + entry.Branch = branch + + // Copy the extract config so each worker gets its own Repo/Output/State + // without racing on shared fields. Preserves ignore patterns, batch + // size, mailmap, etc. from the template. + sub := cfg.Extract + sub.Repo = repo.AbsPath + sub.Branch = branch + sub.Output = jsonlPath + sub.StateFile = statePath + if sub.CommandTimeout == 0 { + sub.CommandTimeout = extract.DefaultCommandTimeout + } + // Always force StartOffset back to -1 (= "load from state file") + // regardless of the template's value. Programmatic callers passing + // a zero-value Config would otherwise feed flagOffset=0 to + // extract.LoadState, which would IGNORE the per-repo state file and + // re-extract from scratch. StartSHA gets cleared for the same + // reason: a single CLI-level SHA cannot apply to N different repos. 
+ sub.StartSHA = "" + sub.StartOffset = -1 + + log.Printf("scan: [%s] extracting from %s (branch %s)", repo.Slug, repo.AbsPath, branch) + + start := time.Now() + if err := extract.Run(ctx, sub); err != nil { + entry.Status = "failed" + entry.Error = fmt.Errorf("extract %s: %w", repo.Slug, err).Error() + entry.DurationMS = time.Since(start).Milliseconds() + log.Printf("scan: [%s] failed: %v", repo.Slug, err) + return entry + } + entry.Status = "ok" + entry.DurationMS = time.Since(start).Milliseconds() + return entry +} + +func writeManifest(path string, m Manifest) error { + data, err := json.MarshalIndent(m, "", " ") + if err != nil { + return err + } + return os.WriteFile(path, data, 0o644) +} diff --git a/internal/scan/scan_test.go b/internal/scan/scan_test.go new file mode 100644 index 0000000..db24410 --- /dev/null +++ b/internal/scan/scan_test.go @@ -0,0 +1,138 @@ +package scan + +import ( + "context" + "encoding/json" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + + "github.com/lex0c/gitcortex/internal/extract" + "github.com/lex0c/gitcortex/internal/stats" +) + +func TestRun_EndToEnd(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not installed") + } + + root := t.TempDir() + makeRealRepo(t, filepath.Join(root, "alpha"), map[string]string{ + "main.go": "package main\nfunc main() {}\n", + "README.md": "# alpha\n", + }) + makeRealRepo(t, filepath.Join(root, "beta"), map[string]string{ + "app.py": "print('hi')\n", + }) + // One node_modules-style decoy that should be ignored. 
+ makeRealRepo(t, filepath.Join(root, "node_modules", "garbage"), map[string]string{ + "x.js": "x\n", + }) + + output := filepath.Join(t.TempDir(), "out") + cfg := Config{ + Roots: []string{root}, + Output: output, + Parallel: 2, + Extract: extract.Config{ + BatchSize: 100, + CommandTimeout: extract.DefaultCommandTimeout, + }, + } + cfg.IgnoreFile = writeIgnoreFile(t, root, "node_modules\n") + + res, err := Run(context.Background(), cfg) + if err != nil { + t.Fatalf("Run: %v", err) + } + + if len(res.JSONLs) != 2 { + t.Fatalf("expected 2 JSONLs, got %d (%v)", len(res.JSONLs), res.JSONLs) + } + + // Manifest sanity + manifestPath := filepath.Join(output, "manifest.json") + mb, err := os.ReadFile(manifestPath) + if err != nil { + t.Fatalf("read manifest: %v", err) + } + var m Manifest + if err := json.Unmarshal(mb, &m); err != nil { + t.Fatalf("parse manifest: %v", err) + } + if len(m.Repos) != 2 { + t.Fatalf("manifest should have 2 repos, got %d", len(m.Repos)) + } + for _, r := range m.Repos { + if r.Status != "ok" { + t.Errorf("repo %s status=%s err=%s", r.Slug, r.Status, r.Error) + } + } + + // Consolidated load + breakdown + ds, err := stats.LoadMultiJSONL(res.JSONLs) + if err != nil { + t.Fatalf("LoadMultiJSONL: %v", err) + } + breakdown := stats.RepoBreakdown(ds, "") + if len(breakdown) != 2 { + t.Fatalf("expected breakdown across 2 repos, got %d: %+v", len(breakdown), breakdown) + } + got := map[string]int{} + for _, b := range breakdown { + got[b.Repo] = b.Commits + } + for _, name := range []string{"alpha", "beta"} { + if got[name] == 0 { + t.Errorf("repo %s missing or has 0 commits in breakdown: %v", name, got) + } + } +} + +// makeRealRepo initializes a git repo with the given files and a single +// commit using a deterministic identity so the assertions don't depend on +// the developer's git config. 
+func makeRealRepo(t *testing.T, dir string, files map[string]string) { + t.Helper() + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + for name, contents := range files { + full := filepath.Join(dir, name) + if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(full, []byte(contents), 0o644); err != nil { + t.Fatal(err) + } + } + + runGit := func(args ...string) { + cmd := exec.Command("git", args...) + cmd.Dir = dir + cmd.Env = append(os.Environ(), + "GIT_AUTHOR_NAME=Test", + "GIT_AUTHOR_EMAIL=test@example.com", + "GIT_COMMITTER_NAME=Test", + "GIT_COMMITTER_EMAIL=test@example.com", + ) + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %s: %v\n%s", strings.Join(args, " "), err, out) + } + } + runGit("init", "-q", "-b", "main") + runGit("add", ".") + runGit("commit", "-q", "-m", "initial") +} + +func writeIgnoreFile(t *testing.T, root, contents string) string { + t.Helper() + path := filepath.Join(root, ".gitcortex-ignore") + if err := os.WriteFile(path, []byte(contents), 0o644); err != nil { + t.Fatal(err) + } + return path +} diff --git a/internal/stats/reader.go b/internal/stats/reader.go index 87fc4ef..63783e7 100644 --- a/internal/stats/reader.go +++ b/internal/stats/reader.go @@ -22,6 +22,11 @@ type commitEntry struct { del int64 files int message string + // repo carries the LoadMultiJSONL pathPrefix (without the trailing + // colon) so RepoBreakdown can attribute commits back to their source + // repository on multi-repo scans. Empty for single-file loads, which + // keeps single-repo callers' behavior unchanged. 
+ repo string } type fileEntry struct { @@ -242,6 +247,7 @@ func streamLoadInto(ds *Dataset, r io.Reader, opt LoadOptions, pathPrefix string del: c.Deletions, files: c.FilesChanged, message: c.Message, + repo: strings.TrimSuffix(pathPrefix, ":"), } // Contributors diff --git a/internal/stats/repo_breakdown.go b/internal/stats/repo_breakdown.go new file mode 100644 index 0000000..0de8091 --- /dev/null +++ b/internal/stats/repo_breakdown.go @@ -0,0 +1,169 @@ +package stats + +import ( + "sort" + "strings" + "time" +) + +// RepoStat aggregates commit-level activity for one repository in a +// multi-repo Dataset. Populated only when LoadMultiJSONL ran with more +// than one input file (single-repo loads have no path prefix to group by, +// so the breakdown would be a noop). +type RepoStat struct { + Repo string + Commits int + Additions int64 + Deletions int64 + Churn int64 + Files int + ActiveDays int + UniqueDevs int + FirstCommitDate string + LastCommitDate string + // PctOfTotalCommits / PctOfTotalChurn are computed against the + // dataset totals so the report can show "this repo was 38% of + // your commits" without the template needing to do the math. + PctOfTotalCommits float64 + PctOfTotalChurn float64 +} + +// RepoBreakdown groups commits by their repo tag (set during ingest from +// the LoadMultiJSONL pathPrefix) and returns one row per repo, sorted by +// commit count descending. When emailFilter is non-empty, only commits +// authored by that email are counted — this is the "show my work across +// repos" lens used by the scan report. +// +// On single-repo datasets (no prefix tag, repo == ""), the function +// returns a single row labeled "(repo)" so callers can still render +// without a special case. +// +// Caveat: the underlying ds.commits map is keyed by SHA. If two repos +// share a commit SHA (cherry-pick, mirror, fork) the second one ingested +// overwrites the first, and the breakdown will report only the +// surviving repo. 
In practice this only matters for forked / mirrored +// repos under a single scan root — for those, scan and aggregate +// separately if exact attribution matters. +func RepoBreakdown(ds *Dataset, emailFilter string) []RepoStat { + type acc struct { + commits int + add, del int64 + days map[string]struct{} + devs map[string]struct{} + first, last time.Time + } + + repos := make(map[string]*acc) + emailLower := strings.ToLower(strings.TrimSpace(emailFilter)) + + for _, c := range ds.commits { + if emailLower != "" && strings.ToLower(strings.TrimSpace(c.email)) != emailLower { + continue + } + key := c.repo + if key == "" { + key = "(repo)" + } + a, ok := repos[key] + if !ok { + a = &acc{ + days: make(map[string]struct{}), + devs: make(map[string]struct{}), + } + repos[key] = a + } + a.commits++ + a.add += c.add + a.del += c.del + if c.email != "" { + a.devs[strings.ToLower(c.email)] = struct{}{} + } + if !c.date.IsZero() { + a.days[c.date.UTC().Format("2006-01-02")] = struct{}{} + if a.first.IsZero() || c.date.Before(a.first) { + a.first = c.date + } + if c.date.After(a.last) { + a.last = c.date + } + } + } + + // File counts come from ds.files. Path prefix is `:` so + // the split key matches the commit-side `c.repo`. Files without a + // prefix (single-repo loads) bucket under the same "(repo)" label. + // + // When emailFilter is set the count must reflect files THIS dev + // touched, not the repo's total — the unfiltered count would + // massively over-state per-dev scope (e.g. "I worked on 12k files + // in monorepo X" when really the dev touched 30). devCommits + // tracks per-file dev appearance and is the right denominator. 
+ fileCounts := make(map[string]int) + for path, fe := range ds.files { + if emailLower != "" { + matched := false + for devEmail := range fe.devCommits { + if strings.ToLower(strings.TrimSpace(devEmail)) == emailLower { + matched = true + break + } + } + if !matched { + continue + } + } + key := "" + if i := strings.Index(path, ":"); i >= 0 { + key = path[:i] + } + if key == "" { + key = "(repo)" + } + fileCounts[key]++ + } + + // Totals computed AFTER the email filter: percentages are relative + // to the filtered universe so a dev profile shows "60% of MY commits + // happened in repo X", not "0.4% of the org's commits". + var totalCommits int + var totalChurn int64 + for _, a := range repos { + totalCommits += a.commits + totalChurn += a.add + a.del + } + + out := make([]RepoStat, 0, len(repos)) + for repo, a := range repos { + stat := RepoStat{ + Repo: repo, + Commits: a.commits, + Additions: a.add, + Deletions: a.del, + Churn: a.add + a.del, + Files: fileCounts[repo], + ActiveDays: len(a.days), + UniqueDevs: len(a.devs), + } + if !a.first.IsZero() { + stat.FirstCommitDate = a.first.UTC().Format("2006-01-02") + } + if !a.last.IsZero() { + stat.LastCommitDate = a.last.UTC().Format("2006-01-02") + } + if totalCommits > 0 { + stat.PctOfTotalCommits = float64(stat.Commits) / float64(totalCommits) * 100 + } + if totalChurn > 0 { + stat.PctOfTotalChurn = float64(stat.Churn) / float64(totalChurn) * 100 + } + out = append(out, stat) + } + + sort.Slice(out, func(i, j int) bool { + if out[i].Commits != out[j].Commits { + return out[i].Commits > out[j].Commits + } + return out[i].Repo < out[j].Repo + }) + return out +} diff --git a/internal/stats/repo_breakdown_test.go b/internal/stats/repo_breakdown_test.go new file mode 100644 index 0000000..952ae00 --- /dev/null +++ b/internal/stats/repo_breakdown_test.go @@ -0,0 +1,127 @@ +package stats + +import ( + "testing" + "time" +) + +func TestRepoBreakdown_GroupsByPrefix(t *testing.T) { + ds := &Dataset{ + commits: 
map[string]*commitEntry{ + "a1": {email: "me@x.com", repo: "alpha", date: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), add: 10, del: 0}, + "a2": {email: "me@x.com", repo: "alpha", date: time.Date(2024, 1, 5, 0, 0, 0, 0, time.UTC), add: 5, del: 2}, + "b1": {email: "you@x.com", repo: "beta", date: time.Date(2024, 2, 1, 0, 0, 0, 0, time.UTC), add: 30, del: 0}, + }, + files: map[string]*fileEntry{ + "alpha:main.go": {}, + "alpha:helper.go": {}, + "beta:app.py": {}, + }, + } + + breakdown := RepoBreakdown(ds, "") + if len(breakdown) != 2 { + t.Fatalf("want 2 repos, got %d", len(breakdown)) + } + // Sort is by commit count desc, so alpha first. + if breakdown[0].Repo != "alpha" || breakdown[0].Commits != 2 { + t.Errorf("alpha: got %+v", breakdown[0]) + } + if breakdown[0].Files != 2 { + t.Errorf("alpha files: got %d, want 2", breakdown[0].Files) + } + if breakdown[1].Repo != "beta" || breakdown[1].Commits != 1 { + t.Errorf("beta: got %+v", breakdown[1]) + } + // Pct totals — alpha 2/3, beta 1/3. + if got := breakdown[0].PctOfTotalCommits; got < 66.6 || got > 66.7 { + t.Errorf("alpha pct commits: got %.2f, want ~66.67", got) + } +} + +func TestRepoBreakdown_EmailFilter(t *testing.T) { + ds := &Dataset{ + commits: map[string]*commitEntry{ + "a1": {email: "me@x.com", repo: "alpha", date: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), add: 10}, + "a2": {email: "other@x.com", repo: "alpha", date: time.Date(2024, 1, 2, 0, 0, 0, 0, time.UTC), add: 5}, + "b1": {email: "me@x.com", repo: "beta", date: time.Date(2024, 2, 1, 0, 0, 0, 0, time.UTC), add: 30}, + }, + // Per-dev file counts: alpha has 2 files but me@ touched only 1; + // beta has 1 file me@ touched. Other dev's exclusive file in + // alpha must NOT be counted toward me@'s repo Files. 
+ files: map[string]*fileEntry{ + "alpha:mine.go": {devCommits: map[string]int{"me@x.com": 1}}, + "alpha:other.go": {devCommits: map[string]int{"other@x.com": 1}}, + "beta:both.py": {devCommits: map[string]int{"me@x.com": 1}}, + }, + } + breakdown := RepoBreakdown(ds, "me@x.com") + if len(breakdown) != 2 { + t.Fatalf("want 2 repos for me@x.com, got %d", len(breakdown)) + } + for _, b := range breakdown { + if b.Commits != 1 { + t.Errorf("repo %s: expected 1 commit (filtered), got %d", b.Repo, b.Commits) + } + if b.Files != 1 { + t.Errorf("repo %s: expected 1 file (filtered), got %d", b.Repo, b.Files) + } + } +} + +// A rename within a repo collapses two file paths onto one canonical +// path during finalizeDataset. The collapsed path keeps its `:` +// prefix, so RepoBreakdown should still attribute the file to the +// correct repo. Regression guard: without proper handling, the +// post-rename canonical key would either lose its prefix (and bucket +// into "(repo)") or duplicate-count. +func TestRepoBreakdown_SurvivesRename(t *testing.T) { + ds := newDataset() + ds.commits["c1"] = &commitEntry{ + email: "me@x.com", repo: "alpha", + date: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), add: 5, + } + ds.commits["c2"] = &commitEntry{ + email: "me@x.com", repo: "alpha", + date: time.Date(2024, 1, 5, 0, 0, 0, 0, time.UTC), add: 10, + } + ds.files["alpha:old.go"] = &fileEntry{ + devCommits: map[string]int{"me@x.com": 1}, + } + ds.files["alpha:new.go"] = &fileEntry{ + devCommits: map[string]int{"me@x.com": 1}, + } + ds.renameEdges = []renameEdge{{ + oldPath: "alpha:old.go", newPath: "alpha:new.go", + commitDate: time.Date(2024, 1, 5, 0, 0, 0, 0, time.UTC), + }} + applyRenames(ds) + + breakdown := RepoBreakdown(ds, "") + if len(breakdown) != 1 || breakdown[0].Repo != "alpha" { + t.Fatalf("want single alpha row, got %+v", breakdown) + } + // After rename collapse: 1 canonical file, both commits attributed. 
+ if breakdown[0].Files != 1 { + t.Errorf("want 1 file (post-rename), got %d", breakdown[0].Files) + } + if breakdown[0].Commits != 2 { + t.Errorf("want 2 commits, got %d", breakdown[0].Commits) + } +} + +func TestRepoBreakdown_SingleRepoFallback(t *testing.T) { + // No prefix tag → all commits bucket under "(repo)". + ds := &Dataset{ + commits: map[string]*commitEntry{ + "x1": {email: "me@x.com", date: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), add: 5}, + }, + files: map[string]*fileEntry{ + "src/foo.go": {}, + }, + } + breakdown := RepoBreakdown(ds, "") + if len(breakdown) != 1 || breakdown[0].Repo != "(repo)" { + t.Fatalf("want single (repo) row, got %+v", breakdown) + } +} From 02100f5c0ac271f32d9d6e015bdf69a332677ede Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 11:58:42 -0300 Subject: [PATCH 02/37] Detect bare repositories during scan discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bare clones (foo.git dirs with HEAD/objects/refs at the root rather than inside .git/) weren't picked up — discovery only accepted working-tree layouts. Since mirrors, fixture clones, and GitHub-style server dirs all use this shape, the omission blocked the most common "point gitcortex scan at a folder of .git clones" validation path. isBareRepo requires all three canonical entries to avoid false positives from stray HEAD/ or refs/ directories. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 42 +++++++++++++++++++++++---------- internal/scan/discovery_test.go | 36 ++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 12 deletions(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 0d7b66f..618fefd 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -77,20 +77,24 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { return filepath.SkipDir } - // Detect .git either as a directory (normal) or a file - // (worktree / submodule). Presence alone is enough — we don't - // try to validate the content; extract will fail loudly if - // the repo is broken, which is a better signal than silently - // skipping. - // Lstat (not Stat) so a symlink named `.git` pointing somewhere - // arbitrary doesn't get treated as a real repo. Real `.git` - // entries are either a directory (normal worktree) or a regular - // file (worktree/submodule pointer). Both are caught here; a - // symlink is rejected, which is the conservative choice — if - // users genuinely point `.git` at a shared dir via symlink they - // can resolve it before scanning. + // Two repo shapes to detect: + // 1. Working tree: path/.git is a dir (normal) or file + // (worktree/submodule pointer). + // 2. Bare repo: path itself contains HEAD + objects/ + // + refs/ — common for clones used as fixtures or + // mirrors (`git clone --bare`, GitHub-style server dirs + // named foo.git). + // Lstat (not Stat) on the .git entry so a symlink named .git + // pointing somewhere arbitrary doesn't get treated as a real + // repo. The bare check uses os.Stat by way of isBareRepo, + // which inspects three required entries — much harder to + // trick than a single dirent check. 
gitEntry := filepath.Join(path, ".git") + isWorkingTree := false if info, statErr := os.Lstat(gitEntry); statErr == nil && info.Mode()&os.ModeSymlink == 0 { + isWorkingTree = true + } + if isWorkingTree || isBareRepo(path) { if seen[path] { return filepath.SkipDir } @@ -155,6 +159,20 @@ func assignSlugs(repos []Repo) { } } +// isBareRepo returns true when path is a bare git repository — i.e. the +// directory itself holds HEAD, objects/, and refs/ rather than wrapping +// them in a .git subdirectory. All three entries are required because +// a stray HEAD file or empty refs/ dir alone is not enough to be a real +// repo and we don't want false positives polluting the manifest. +func isBareRepo(path string) bool { + for _, name := range []string{"HEAD", "objects", "refs"} { + if _, err := os.Stat(filepath.Join(path, name)); err != nil { + return false + } + } + return true +} + // sanitizeSlug strips characters that would break the LoadMultiJSONL prefix // contract (`:`) or filesystem paths. Keeps alphanumerics, dash, // underscore, and dot. diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 4bb7a0c..9957b21 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -164,3 +164,39 @@ func mustMkRepo(t *testing.T, path string) { t.Fatal(err) } } + +func TestDiscover_FindsBareRepo(t *testing.T) { + root := t.TempDir() + bare := filepath.Join(root, "myrepo.git") + for _, name := range []string{"HEAD", "objects", "refs"} { + full := filepath.Join(bare, name) + if name == "HEAD" { + if err := os.MkdirAll(bare, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(full, []byte("ref: refs/heads/main\n"), 0o644); err != nil { + t.Fatal(err) + } + } else { + if err := os.MkdirAll(full, 0o755); err != nil { + t.Fatal(err) + } + } + } + // Decoy: dir with HEAD only — must not be picked up. 
+ decoy := filepath.Join(root, "not-a-repo") + if err := os.MkdirAll(decoy, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(decoy, "HEAD"), []byte("nope\n"), 0o644); err != nil { + t.Fatal(err) + } + + repos, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + if len(repos) != 1 || repos[0].Slug != "myrepo.git" { + t.Fatalf("expected single myrepo.git, got %+v", repos) + } +} From 16742dadd5cde382ee7b805d184361a1e77af30e Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:04:14 -0300 Subject: [PATCH 03/37] Strip repo prefix from suspect-detector suggestions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LoadMultiJSONL prefixes file paths with ':' so cross-repo aggregates can attribute files back to their source. The suspect detector's directory-glob suggestions inherited that prefix, which made the warning's remediation command a no-op: extract --ignore and scan --extract-ignore both expect bare, repo-relative globs. Strip the prefix in CollectAllSuggestions before dedup — same shape from different repos (dist/*, node_modules/*) now collapses to one canonical suggestion that works across every repo in the scan. The stripping is conservative: a colon only counts as a prefix delimiter when the preceding segment contains no slash or glob metachars, preserving the rare but valid case of a colon embedded in a real path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/stats/suspect.go | 30 ++++++++++++++++++ internal/stats/suspect_test.go | 57 ++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/internal/stats/suspect.go b/internal/stats/suspect.go index 33a4b5f..af62aaf 100644 --- a/internal/stats/suspect.go +++ b/internal/stats/suspect.go @@ -211,11 +211,23 @@ func ShellQuoteSingle(s string) string { // trimming suggestions to just the displayed subset would leave // unshown suspects untouched and cause the warning to persist after // the suggested fix. +// +// Multi-repo note: LoadMultiJSONL prefixes file paths with `:` +// so every consumer (hotspots, directories, suspect detector) can +// attribute paths back to their source. The suggestion strings +// inherit that prefix from suggestDirGlob, but `extract --ignore` and +// `scan --extract-ignore` both consume bare repo-relative globs — +// neither understands the `:` notation. Strip it here so the +// copy-pasted remediation command actually matches the offending +// paths. Same-shaped globs from different repos collapse via the +// dedup below (e.g. `dist/*` appearing under multiple repos becomes +// one suggestion, which is what the user wants). func CollectAllSuggestions(buckets []SuspectBucket) []string { seen := map[string]struct{}{} var out []string for _, b := range buckets { for _, s := range b.Suggestions { + s = stripRepoPrefix(s) if _, ok := seen[s]; ok { continue } @@ -225,3 +237,21 @@ func CollectAllSuggestions(buckets []SuspectBucket) []string { } return out } + +// stripRepoPrefix removes a leading `:` from a suggestion glob +// when present. Conservative: the prefix is only recognized when the +// segment before the colon has no slash or glob metacharacters, which +// matches the LoadMultiJSONL convention and avoids misinterpreting +// legitimate colons inside paths (rare but possible on some +// filesystems). 
Single-repo datasets have no prefix and pass through +// unchanged. +func stripRepoPrefix(s string) string { + i := strings.Index(s, ":") + if i < 0 { + return s + } + if strings.ContainsAny(s[:i], "/*?[") { + return s + } + return s[i+1:] +} diff --git a/internal/stats/suspect_test.go b/internal/stats/suspect_test.go index a83ea2c..bd2df02 100644 --- a/internal/stats/suspect_test.go +++ b/internal/stats/suspect_test.go @@ -1,6 +1,7 @@ package stats import ( + "strings" "testing" "github.com/lex0c/gitcortex/internal/extract" @@ -353,6 +354,62 @@ func TestCollectAllSuggestionsCoversUnshownBuckets(t *testing.T) { } } +// When paths come from LoadMultiJSONL they carry a `:` prefix. +// Those prefixes leak into directory-glob suggestions and make the +// copy-pasted `extract --ignore` / `scan --extract-ignore` command +// a no-op on real repo paths (which never carry the prefix). +// CollectAllSuggestions must strip the prefix and collapse duplicates +// that only differed in which repo they came from. +func TestCollectAllSuggestionsStripsRepoPrefix(t *testing.T) { + ds := &Dataset{ + files: map[string]*fileEntry{ + // Two repos with the same offending shape — suggestions + // collapse to one canonical "dist/*" glob. + "repoA:js/dist/a.min.js": {additions: 400, deletions: 400}, + "repoB:css/dist/b.min.css": {additions: 400, deletions: 400}, + // A single-repo path stays untouched (no prefix). 
+ "vendor/c.go": {additions: 400, deletions: 400}, + }, + } + buckets, _ := DetectSuspectFiles(ds) + got := CollectAllSuggestions(buckets) + + for _, s := range got { + if strings.Contains(s, ":") { + t.Errorf("suggestion %q still carries a repo prefix — users can't copy-paste this into extract --ignore", s) + } + } + has := func(want string) bool { + for _, s := range got { + if s == want { + return true + } + } + return false + } + for _, want := range []string{"js/dist/*", "css/dist/*", "vendor/*"} { + if !has(want) { + t.Errorf("expected suggestion %q in %v", want, got) + } + } +} + +func TestStripRepoPrefix(t *testing.T) { + cases := []struct{ in, want string }{ + {"repoA:wp-includes/dist/*", "wp-includes/dist/*"}, + {"dist/*", "dist/*"}, // no prefix + {"*.min.js", "*.min.js"}, // no prefix + {"vendor/*", "vendor/*"}, // no prefix (no colon) + {"src/weird:name.go", "src/weird:name.go"}, // colon after slash = not a prefix + {"glob*prefix:foo/bar", "glob*prefix:foo/bar"}, // metachar before colon = not a prefix + } + for _, c := range cases { + if got := stripRepoPrefix(c.in); got != c.want { + t.Errorf("stripRepoPrefix(%q) = %q, want %q", c.in, got, c.want) + } + } +} + func TestSuspectSuggestionsMatchExtractShouldIgnore(t *testing.T) { // Mix root-level and nested occurrences of every pattern class: // directory segments (vendor, dist), suffix (*.min.js), basename From 61f8e65a359bd07be00a33338e36b6f09f02f783 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:13:37 -0300 Subject: [PATCH 04/37] Honor negated ignore rules before pruning a directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unconditional SkipDir on an ignored directory short-circuited the matcher's last-match-wins semantics: patterns like `vendor/` + `!vendor/keep` would prune vendor before vendor/keep could be examined, so intentionally re-included repositories were silently dropped from the scan. 
Matcher now exposes CouldReinclude(dir), which returns true iff any negation rule could fire on a descendant of dir — explicit child references (!vendor/keep) and basename-only negations (!keep) both force descent. Discovery descends instead of pruning whenever that check is true, letting per-path Match() decide. Unrelated exclusions keep pruning effective for the common case. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 11 ++++++- internal/scan/discovery_test.go | 53 +++++++++++++++++++++++++++++++++ internal/scan/ignore.go | 37 +++++++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 618fefd..feae8b2 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -74,7 +74,16 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { } if rel != "." && matcher.Match(rel, true) { - return filepath.SkipDir + // Don't SkipDir blindly — if any negation rule could + // re-include a descendant (e.g. `vendor/` + `!vendor/keep`), + // pruning here would drop the re-inclusion before its + // target is visited. Descend and let Match() decide + // per-path. For ordinary `node_modules`-style excludes + // with no negations in play, CouldReinclude returns + // false and pruning remains. + if !matcher.CouldReinclude(rel) { + return filepath.SkipDir + } } // Two repo shapes to detect: diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 9957b21..e5917ef 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -50,6 +50,59 @@ func TestDiscover_RespectsIgnore(t *testing.T) { } } +// Regression: `vendor/` + `!vendor/keep` must descend into vendor/ +// so the negation has a chance to re-include vendor/keep. Before the +// fix, the walker SkipDir'd vendor unconditionally and the re-included +// repo was silently dropped. 
+func TestDiscover_HonorsNegatedDescendant(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "app")) + mustMkRepo(t, filepath.Join(root, "vendor", "garbage")) + mustMkRepo(t, filepath.Join(root, "vendor", "keep")) + + matcher := NewMatcher([]string{"vendor/", "!vendor/keep"}) + repos, err := Discover([]string{root}, matcher, 0) + if err != nil { + t.Fatal(err) + } + got := map[string]bool{} + for _, r := range repos { + got[r.RelPath] = true + } + if !got["app"] { + t.Errorf("app should be included: %+v", repos) + } + if !got["vendor/keep"] { + t.Errorf("vendor/keep should be re-included by negation rule: %+v", repos) + } + if got["vendor/garbage"] { + t.Errorf("vendor/garbage should remain ignored: %+v", repos) + } +} + +func TestMatcher_CouldReinclude(t *testing.T) { + cases := []struct { + name string + patterns []string + dir string + want bool + }{ + {"no negation", []string{"vendor/"}, "vendor", false}, + {"explicit descendant", []string{"vendor/", "!vendor/keep"}, "vendor", true}, + {"unrelated negation", []string{"vendor/", "!src/main"}, "vendor", false}, + {"basename negation could fire anywhere", []string{"vendor/", "!keep"}, "vendor", true}, + {"deep-match negation", []string{"build/", "!**/src"}, "build", true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + m := NewMatcher(c.patterns) + if got := m.CouldReinclude(c.dir); got != c.want { + t.Errorf("CouldReinclude(%q) with %v = %v, want %v", c.dir, c.patterns, got, c.want) + } + }) + } +} + func TestDiscover_DoesNotDescendIntoRepo(t *testing.T) { root := t.TempDir() parent := filepath.Join(root, "parent") diff --git a/internal/scan/ignore.go b/internal/scan/ignore.go index c378b05..30ab306 100644 --- a/internal/scan/ignore.go +++ b/internal/scan/ignore.go @@ -105,6 +105,43 @@ func (m *Matcher) Match(relPath string, isDir bool) bool { return matched } +// CouldReinclude reports whether any negation rule targets a descendant +// of dir — i.e. 
walking into the dir could still yield a re-included +// path. Callers use this to decide whether to prune an ignored +// directory via filepath.SkipDir or descend into it and evaluate +// children individually. +// +// Without this check the walker short-circuits the matcher's last- +// match-wins semantics: a `vendor/` + `!vendor/keep` pair would skip +// the vendor subtree entirely before vendor/keep could be examined, +// silently dropping the re-included path. +// +// The check is conservative — it returns true when: +// - a negation's pattern begins with `dir/` (explicit descendant +// reference, the documented common case), OR +// - a negation's pattern has no path separator (basename rules like +// `!keep` that could fire anywhere, including inside dir). +// +// Negations that target a sibling or ancestor path correctly don't +// trigger descent, so pruning remains effective for unrelated ignored +// trees like `node_modules/`. +func (m *Matcher) CouldReinclude(dir string) bool { + dir = filepath.ToSlash(dir) + for _, r := range m.rules { + if !r.Negate { + continue + } + pat := strings.TrimPrefix(r.Pattern, "**/") + if !strings.Contains(pat, "/") { + return true + } + if strings.HasPrefix(pat, dir+"/") { + return true + } + } + return false +} + func matchRule(r IgnoreRule, relPath string) bool { pat := r.Pattern From 5950a26b7bbe0a9d063939b3286e8ad903fc451a Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:17:43 -0300 Subject: [PATCH 05/37] Keep per-repo commits when SHAs overlap across repositories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ds.commits is keyed by SHA; when a scan ingests repos that share a commit (fork, mirror, cherry-pick) the later ingest overwrote the earlier one. 
RepoBreakdown iterated ds.commits directly, so the consolidated report silently under-counted commits and churn for one side of every collision — materially wrong percentages in exactly the multi-repo scenarios scan was built for. Track commits in a parallel map[repo][]*commitEntry populated at ingest. File-side lookups keep using ds.commits (one record per SHA is enough there); RepoBreakdown iterates the per-repo slice, which preserves every commit regardless of collisions. Regression test uses LoadMultiJSONL with two JSONLs that share a SHA — without the fix, one of the two repos disappears from the breakdown entirely. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/stats/reader.go | 24 ++++++++++- internal/stats/repo_breakdown.go | 59 ++++++++++++++++++--------- internal/stats/repo_breakdown_test.go | 50 +++++++++++++++++++++++ 3 files changed, 112 insertions(+), 21 deletions(-) diff --git a/internal/stats/reader.go b/internal/stats/reader.go index 63783e7..4162249 100644 --- a/internal/stats/reader.go +++ b/internal/stats/reader.go @@ -87,6 +87,17 @@ type Dataset struct { files map[string]*fileEntry workGrid [7][24]int + // commitsByRepo preserves commit records per repository so the + // per-repo breakdown stays correct when two repos share a SHA + // (forks, mirrors, cherry-picks between sibling projects). The + // commits map above is keyed by SHA and drops the losing side of + // a collision — acceptable for file-side lookups where a single + // record per SHA is enough — but RepoBreakdown would then + // silently under-count the overwritten repo's commits and churn. + // Populated at ingest in lockstep with commits; a single-repo + // load (no pathPrefix) keeps all entries under the "(repo)" key. 
+ commitsByRepo map[string][]*commitEntry + // Coupling (pre-aggregated during load) couplingPairs map[filePair]int couplingFileChanges map[string]int @@ -163,6 +174,7 @@ func newDataset() *Dataset { contribFiles: make(map[string]map[string]struct{}), contribFirst: make(map[string]time.Time), contribLast: make(map[string]time.Time), + commitsByRepo: make(map[string][]*commitEntry), } } @@ -240,7 +252,7 @@ func streamLoadInto(ds *Dataset, r io.Reader, opt LoadOptions, pathPrefix string ds.TotalDeletions += c.Deletions ds.TotalFilesChanged += int64(c.FilesChanged) - ds.commits[c.SHA] = &commitEntry{ + entry := &commitEntry{ email: c.AuthorEmail, date: t, add: c.Additions, @@ -249,6 +261,16 @@ func streamLoadInto(ds *Dataset, r io.Reader, opt LoadOptions, pathPrefix string message: c.Message, repo: strings.TrimSuffix(pathPrefix, ":"), } + ds.commits[c.SHA] = entry + // Record separately per-repo so RepoBreakdown survives + // SHA collisions across repositories (forks/mirrors/ + // cherry-picks). Key defaults to "(repo)" in single-repo + // loads so the fallback row in RepoBreakdown still works. + repoKey := entry.repo + if repoKey == "" { + repoKey = "(repo)" + } + ds.commitsByRepo[repoKey] = append(ds.commitsByRepo[repoKey], entry) // Contributors cs, ok := ds.contributors[c.AuthorEmail] diff --git a/internal/stats/repo_breakdown.go b/internal/stats/repo_breakdown.go index 0de8091..57822e8 100644 --- a/internal/stats/repo_breakdown.go +++ b/internal/stats/repo_breakdown.go @@ -28,22 +28,26 @@ type RepoStat struct { PctOfTotalChurn float64 } -// RepoBreakdown groups commits by their repo tag (set during ingest from -// the LoadMultiJSONL pathPrefix) and returns one row per repo, sorted by -// commit count descending. When emailFilter is non-empty, only commits -// authored by that email are counted — this is the "show my work across -// repos" lens used by the scan report. 
+// RepoBreakdown groups commits by repo and returns one row per repo, +// sorted by commit count descending. When emailFilter is non-empty, +// only commits authored by that email are counted — this is the +// "show my work across repos" lens used by the scan report. // -// On single-repo datasets (no prefix tag, repo == ""), the function -// returns a single row labeled "(repo)" so callers can still render -// without a special case. +// Source of truth is ds.commitsByRepo, a per-repository slice +// populated at ingest. The SHA-keyed ds.commits map cannot be used +// directly because two repos that share a commit SHA (fork, mirror, +// cherry-pick) would lose the earlier ingest to the later one, +// under-counting one repo's totals. Iterating commitsByRepo sidesteps +// that collision. // -// Caveat: the underlying ds.commits map is keyed by SHA. If two repos -// share a commit SHA (cherry-pick, mirror, fork) the second one ingested -// overwrites the first, and the breakdown will report only the -// surviving repo. In practice this only matters for forked / mirrored -// repos under a single scan root — for those, scan and aggregate -// separately if exact attribution matters. +// Fallback: test datasets that hand-build ds.commits without +// populating commitsByRepo still work — we detect the empty slice +// and bucket ds.commits by repo as before. Production ingest always +// populates commitsByRepo. +// +// On single-repo datasets (no prefix tag), the function returns a +// single row labeled "(repo)" so callers can render without a special +// case. 
func RepoBreakdown(ds *Dataset, emailFilter string) []RepoStat { type acc struct { commits int @@ -56,13 +60,9 @@ func RepoBreakdown(ds *Dataset, emailFilter string) []RepoStat { repos := make(map[string]*acc) emailLower := strings.ToLower(strings.TrimSpace(emailFilter)) - for _, c := range ds.commits { + visit := func(key string, c *commitEntry) { if emailLower != "" && strings.ToLower(strings.TrimSpace(c.email)) != emailLower { - continue - } - key := c.repo - if key == "" { - key = "(repo)" + return } a, ok := repos[key] if !ok { @@ -89,6 +89,25 @@ func RepoBreakdown(ds *Dataset, emailFilter string) []RepoStat { } } + if len(ds.commitsByRepo) > 0 { + for repo, entries := range ds.commitsByRepo { + for _, c := range entries { + visit(repo, c) + } + } + } else { + // Fallback for hand-built test datasets that populate + // ds.commits directly. Production ingest always goes through + // streamLoadInto and populates commitsByRepo. + for _, c := range ds.commits { + key := c.repo + if key == "" { + key = "(repo)" + } + visit(key, c) + } + } + // File counts come from ds.files. Path prefix is `:` so // the split key matches the commit-side `c.repo`. Files without a // prefix (single-repo loads) bucket under the same "(repo)" label. diff --git a/internal/stats/repo_breakdown_test.go b/internal/stats/repo_breakdown_test.go index 952ae00..404a563 100644 --- a/internal/stats/repo_breakdown_test.go +++ b/internal/stats/repo_breakdown_test.go @@ -1,6 +1,8 @@ package stats import ( + "os" + "path/filepath" "testing" "time" ) @@ -110,6 +112,54 @@ func TestRepoBreakdown_SurvivesRename(t *testing.T) { } } +// Regression: when two repos share a commit SHA (fork / mirror / +// cherry-pick) the ingest map ds.commits drops one side. Without a +// separate per-repo record, RepoBreakdown would silently under-count +// the repo ingested first and the consolidated report would show +// wrong percentages. Must preserve both. 
+func TestRepoBreakdown_SurvivesCrossRepoSHACollision(t *testing.T) { + dir := t.TempDir() + // Two repos with IDENTICAL commit rows — different churn per row + // so we can verify both contribute to the breakdown. In real + // life this maps to a cherry-picked or mirrored commit where the + // SHA matches but the files and stats land in different repos. + commitRow := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","author_date":"2024-01-01T00:00:00Z","author_email":"me@x.com","author_name":"Me","additions":10,"deletions":0,"files_changed":1}` + alphaFile := `{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"a.go","additions":10,"deletions":0}` + betaFile := `{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"b.go","additions":10,"deletions":0}` + + alpha := filepath.Join(dir, "alpha.jsonl") + beta := filepath.Join(dir, "beta.jsonl") + if err := os.WriteFile(alpha, []byte(commitRow+"\n"+alphaFile+"\n"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(beta, []byte(commitRow+"\n"+betaFile+"\n"), 0o644); err != nil { + t.Fatal(err) + } + + ds, err := LoadMultiJSONL([]string{alpha, beta}) + if err != nil { + t.Fatalf("LoadMultiJSONL: %v", err) + } + + breakdown := RepoBreakdown(ds, "") + if len(breakdown) != 2 { + t.Fatalf("want breakdown across both alpha AND beta despite SHA collision, got %d rows: %+v", len(breakdown), breakdown) + } + got := map[string]int{} + for _, b := range breakdown { + got[b.Repo] = b.Commits + } + if got["alpha"] != 1 || got["beta"] != 1 { + t.Errorf("each repo should keep its commit; got alpha=%d beta=%d", got["alpha"], got["beta"]) + } + // Sanity on percentage — each repo is 50% of the 2 collided commits. 
+ for _, b := range breakdown { + if b.PctOfTotalCommits != 50 { + t.Errorf("repo %s: want 50%% commits, got %.1f", b.Repo, b.PctOfTotalCommits) + } + } +} + func TestRepoBreakdown_SingleRepoFallback(t *testing.T) { // No prefix tag → all commits bucket under "(repo)". ds := &Dataset{ From 9a6f5f68148e5ded305831e5ab57d870cc4b9568 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:29:32 -0300 Subject: [PATCH 06/37] Normalize repo prefix before suspect-pattern matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DetectSuspectFiles fed ds.files keys straight into the pattern predicates, but multi-repo loads prefix every path with :. Root-level vendor, package-lock.json, go.sum, and friends missed detection — `strings.HasPrefix(p, "vendor/")` and `p == "package-lock.json"` both fail when the string actually begins with `alpha:vendor/x.go` or equals `alpha:package-lock.json`. Nested occurrences like `repo:pkg/vendor/x.go` still matched via the `/vendor/` substring check, so the bug looked partial in the wild: heavy repos with nested generated content triggered the warning, while a Go project with vendor/ at the root under the same scan went silently unflagged and users kept chasing bus-factor ghosts from vendored code. Strip the `:` prefix once per file and feed the normalized path to both Match and Suggest. Nested paths are unaffected; single- repo loads (no prefix) pass through unchanged. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/stats/suspect.go | 15 +++++++++-- internal/stats/suspect_test.go | 47 ++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/internal/stats/suspect.go b/internal/stats/suspect.go index af62aaf..c12f6f4 100644 --- a/internal/stats/suspect.go +++ b/internal/stats/suspect.go @@ -142,11 +142,22 @@ func DetectSuspectFiles(ds *Dataset) ([]SuspectBucket, bool) { for path, fe := range ds.files { fileChurn := fe.additions + fe.deletions totalChurn += fileChurn + // Multi-repo loads prefix every path with `:` so files + // from different repos can coexist in one Dataset. The + // pattern predicates assume bare repo-relative paths — they + // look for root-level `vendor/`, full-path `package-lock.json`, + // and similar shapes that fail when the string actually + // begins with `alpha:vendor/x.go` or equals + // `alpha:package-lock.json`. Strip the prefix once and feed + // the normalized form to both Match and Suggest; nested + // occurrences (e.g. `repo:pkg/vendor/x.go`) still work via + // the normal substring checks on the stripped path. + matchPath := stripRepoPrefix(path) // A file might match multiple patterns (e.g. vendor/*.pb.go). // Attribute it to the first match so totals don't double-count; // the first-match wins keeps output predictable. 
for _, pat := range defaultSuspectPatterns { - if !pat.Match(path) { + if !pat.Match(matchPath) { continue } b, ok := buckets[pat.Glob] @@ -159,7 +170,7 @@ func DetectSuspectFiles(ds *Dataset) ([]SuspectBucket, bool) { b.Churn += fileChurn suspectChurn += fileChurn if pat.Suggest != nil { - suggSets[pat.Glob][pat.Suggest(path)] = struct{}{} + suggSets[pat.Glob][pat.Suggest(matchPath)] = struct{}{} } break } diff --git a/internal/stats/suspect_test.go b/internal/stats/suspect_test.go index bd2df02..a9dfa7c 100644 --- a/internal/stats/suspect_test.go +++ b/internal/stats/suspect_test.go @@ -410,6 +410,53 @@ func TestStripRepoPrefix(t *testing.T) { } } +// Regression: multi-repo loads prefix every file path with `:`, +// and the pattern predicates (hasPathSegment, basenameEquals) compare +// against the raw string. Root-level occurrences like +// `alpha:vendor/x.go` and `alpha:package-lock.json` slipped past +// detection because `strings.HasPrefix(p, "vendor/")` and +// `p == "package-lock.json"` both fail when the `:` prefix is +// present. Symptom in the wild: stats on a scan consumed multiple +// --input files, suspect warning suppressed, users kept seeing +// inflated churn/bus-factor from generated content with no hint to +// fix it. +func TestDetectSuspectFiles_RootLevelUnderRepoPrefix(t *testing.T) { + ds := &Dataset{ + files: map[string]*fileEntry{ + // Root-level occurrences — the cases that previously missed. + "alpha:vendor/x.go": {additions: 400, deletions: 400}, + "beta:package-lock.json": {additions: 400, deletions: 400}, + "beta:go.sum": {additions: 400, deletions: 400}, + // Nested occurrence — worked before the fix and must still work. 
+ "alpha:pkg/vendor/deep.go": {additions: 100, deletions: 100}, + }, + } + buckets, worth := DetectSuspectFiles(ds) + if !worth { + t.Fatal("expected warning-worthy dataset; prefix-stripped matching should fire on root-level vendor/lockfiles") + } + + got := map[string]bool{} + for _, b := range buckets { + got[b.Pattern.Glob] = true + } + for _, want := range []string{"vendor/*", "package-lock.json", "go.sum"} { + if !got[want] { + t.Errorf("pattern %q missing from detected buckets %v — root-level %q under repo prefix went undetected", want, got, want) + } + } + + // Suggestions should come out clean (no `:`) so the + // copy-pasted --ignore command actually matches these paths + // next run. + suggestions := CollectAllSuggestions(buckets) + for _, s := range suggestions { + if len(s) > 0 && s[0] == '\'' { + t.Errorf("suggestion %q should not carry shell quotes here", s) + } + } +} + func TestSuspectSuggestionsMatchExtractShouldIgnore(t *testing.T) { // Mix root-level and nested occurrences of every pattern class: // directory segments (vendor, dist), suffix (*.min.js), basename From 98efb109897781c6e475a1ebeb52a5d611679e78 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:33:03 -0300 Subject: [PATCH 07/37] Use numeric value for repo breakdown bar widths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Per-Repository Breakdown section fed pctFloat's output (a string) into printf "%.0f". html/template's CSS sanitizer saw the resulting %!f(string=...) as unsafe and replaced every bar width with ZgotmplZ — the commit-share visualization rendered at zero width in every multi-repo report. PctOfTotalCommits is already a float64 on RepoStat, so printf it directly. Drop the $maxRepoCommits variable that was only guarding a zero-churn case already covered by PctOfTotalCommits returning 0. Regression test asserts no ZgotmplZ or %!f(string= markers appear in a multi-repo render. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/report_test.go | 44 ++++++++++++++++++++++++++++++++++ internal/report/template.go | 5 ++-- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/internal/report/report_test.go b/internal/report/report_test.go index 2b4c826..595b0f6 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -77,6 +77,50 @@ func TestGenerate_SmokeRender(t *testing.T) { } } +// Regression: the Per-Repository Breakdown section emits a CSS width +// for each repo's commit-share bar. The width has to be a numeric +// literal — if the template feeds a string through printf "%.0f" +// (e.g. via the string-returning pctFloat helper) html/template's +// CSS sanitizer replaces it with `ZgotmplZ` and the bars render at +// zero width. Exercise the multi-repo path end-to-end and assert +// the rendered widths are clean. +func TestGenerate_PerRepoBreakdownWidthsAreNumeric(t *testing.T) { + dir := t.TempDir() + alpha := filepath.Join(dir, "alpha.jsonl") + beta := filepath.Join(dir, "beta.jsonl") + // Distinct SHAs and content so both repos appear in the breakdown. 
+ alphaRow := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","author_email":"me@x.com","author_name":"Me","author_date":"2024-01-01T00:00:00Z","additions":10,"deletions":0,"files_changed":1} +{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"a.go","additions":10,"deletions":0} +` + betaRow := `{"type":"commit","sha":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb","author_email":"me@x.com","author_name":"Me","author_date":"2024-02-01T00:00:00Z","additions":20,"deletions":5,"files_changed":1} +{"type":"commit_file","commit":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb","path_current":"b.go","additions":20,"deletions":5} +` + if err := os.WriteFile(alpha, []byte(alphaRow), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(beta, []byte(betaRow), 0o644); err != nil { + t.Fatal(err) + } + ds, err := stats.LoadMultiJSONL([]string{alpha, beta}) + if err != nil { + t.Fatalf("LoadMultiJSONL: %v", err) + } + var buf bytes.Buffer + if err := Generate(&buf, ds, "scan-fixture", 10, stats.StatsFlags{CouplingMinChanges: 1, NetworkMinFiles: 1}); err != nil { + t.Fatalf("Generate: %v", err) + } + out := buf.String() + if !strings.Contains(out, "Per-Repository Breakdown") { + t.Fatal("breakdown section missing from multi-repo report") + } + if strings.Contains(out, "ZgotmplZ") { + t.Error("report contains ZgotmplZ — template fed a non-numeric value to a CSS context (likely printf'd a string)") + } + if strings.Contains(out, "%!f(string=") { + t.Error("report contains %!f(string=...) 
— printf was given a string where a float was expected") + } +} + func TestGenerate_EmptyDataset(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "empty.jsonl") diff --git a/internal/report/template.go b/internal/report/template.go index 5abc4e1..9e19846 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -147,8 +147,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{if gt (len .Repos) 1}}

Per-Repository Breakdown {{thousands (len .Repos)}} repositories

-

Cross-repo aggregation from scan. Use this view to see how work is distributed: a single repo dominating commits or churn often means the consolidated activity story is really about that one project; an even spread shows broad multi-repo engagement. Bar widths are normalized to the largest commit-count repo.

-{{$maxRepoCommits := 0}}{{range .Repos}}{{if gt .Commits $maxRepoCommits}}{{$maxRepoCommits = .Commits}}{{end}}{{end}} +

Cross-repo aggregation from scan. Use this view to see how work is distributed: a single repo dominating commits or churn often means the consolidated activity story is really about that one project; an even spread shows broad multi-repo engagement. Bar widths are the share of total commits across all scanned repos.

@@ -168,7 +167,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col ` followed by + // a `` for commits. Assert 1 commit per repo — if the + // filter leaked, alpha would show 2 (me's + colleague's). + commitCountRe := regexp.MustCompile(`\s*`) + rows := commitCountRe.FindAllStringSubmatch(out, -1) + if len(rows) != 2 { + t.Fatalf("expected 2 repo rows in breakdown, got %d: %v", len(rows), rows) + } + for _, r := range rows { + if r[2] != "1" { + t.Errorf("repo %s shows %s commits in profile breakdown — email filter leaked (want 1)", r[1], r[2]) + } + } + + // Colleague-exclusive file must not appear in my scope. The + // template renders file counts as `` inside each row + // — indirect assertion: if the file-count bumped, the dev-filter + // on devCommits is wrong. + if strings.Contains(out, "colleague-only.go") { + t.Error("profile report mentions a file only the colleague touched") + } } func TestGenerate_EmptyDataset(t *testing.T) { diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 8639acb..f07214d 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" "testing" + "time" ) func TestDiscover_FindsRepos(t *testing.T) { @@ -184,6 +185,74 @@ func TestAssignSlugs_DeterministicUnderRetry(t *testing.T) { } } +// Mid-walk cancel: the pre-cancelled test below only proves the +// first callback checks ctx. This test cancels AFTER Discover has +// already started walking — the scenario users hit with Ctrl+C on +// a long home-dir scan. Asserting the walk stops in flight, not +// just when prompted at the very start. +// +// The tree is large enough (5,000 dirs × 3 subdirs = 20k nodes) +// that a full walk on typical hardware takes tens of ms; sleeping +// briefly before cancel lands the signal mid-flight. 
We assert: +// - err is context.Canceled (ctx check fired inside the walk) +// - elapsed time < full baseline (walk was actually interrupted) +// - at least one repo was discovered BEFORE cancel (not a +// pre-cancel scenario in disguise) +func TestDiscover_AbortsMidWalk(t *testing.T) { + if testing.Short() { + t.Skip("builds a large synthetic tree; skipped in -short mode") + } + root := t.TempDir() + const topLevel = 5000 + for i := 0; i < topLevel; i++ { + parent := filepath.Join(root, fmt.Sprintf("d-%05d", i)) + for j := 0; j < 3; j++ { + if err := os.MkdirAll(filepath.Join(parent, fmt.Sprintf("sub-%d", j)), 0o755); err != nil { + t.Fatal(err) + } + } + // Every top-level dir is a repo so the result count is a + // direct proxy for how far the walk got. + mustMkRepo(t, parent) + } + + // Baseline: uncancelled walk finds all topLevel repos. + baseStart := time.Now() + full, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + baseline := time.Since(baseStart) + if len(full) != topLevel { + t.Fatalf("baseline walk should find all %d repos, got %d", topLevel, len(full)) + } + + // Cancelled walk: give the walk a head start, then cancel. + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + time.Sleep(baseline / 4) + cancel() + close(done) + }() + + cancStart := time.Now() + repos, err := Discover(ctx, []string{root}, NewMatcher(nil), 0) + cancElapsed := time.Since(cancStart) + <-done + + if err != context.Canceled { + t.Fatalf("want context.Canceled after mid-walk cancel, got err=%v (elapsed %v, baseline %v, repos %d)", + err, cancElapsed, baseline, len(repos)) + } + if cancElapsed >= baseline { + t.Errorf("cancelled walk took %v — baseline was %v. 
Ctx respected but walk did not shortcut; users will see delayed Ctrl+C", cancElapsed, baseline) + } + if len(repos) >= topLevel { + t.Errorf("cancelled walk returned all %d repos — appears to have finished before cancel fired; baseline %v, elapsed %v", len(repos), baseline, cancElapsed) + } +} + // A cancelled context must abort the walk — previously Discover // ignored ctx entirely, so Ctrl+C during the walk phase only took // effect after every directory had been stat'd. Test uses a From 0947705215d3828a5bb84cac10ebfc0f7aef018d Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 13:47:29 -0300 Subject: [PATCH 13/37] Document scan command and per-repo breakdown metric README gains a full `Scan: discover and aggregate every repo under a root` section covering the headline flows: consolidated team report, `--email` personal cross-repo profile, multi-root workspaces. Covers the output directory layout (slug.jsonl, slug.state, manifest.json), the `.gitcortex-ignore` syntax with negation examples, and the tuning flags forwarded to the consolidated report. HTML report section notes the new Per-Repository Breakdown that appears on multi-repo runs. Architecture tree picks up internal/scan and internal/stats/repo_breakdown. METRICS.md adds a Per-Repository Breakdown entry describing columns, how the repo label is derived from the LoadMultiJSONL path prefix, and the divergence-between-commits-and-churn signal. Vendor/generated warning section notes the prefix-stripping for multi-JSONL inputs so readers understand why suggestions come out repo-relative. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++- docs/METRICS.md | 23 ++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 29ea136..bdcf222 100644 --- a/README.md +++ b/README.md @@ -401,6 +401,67 @@ gitcortex stats --input auth.jsonl --input payments.jsonl --stat coupling --top Paths appear as `auth:src/main.go` and `payments:src/main.go`. Contributors are deduped by email across repos — the same developer contributing to both repos is counted once. +For workspaces containing many repos (an engineer's `~/work`, a platform team's service folder), `gitcortex scan` discovers every `.git` under one or more roots and extracts them in parallel — see below. + +### Scan: discover and aggregate every repo under a root + +Walk one or more directories, find every git repository (working trees and bare clones both detected), extract them in parallel, and optionally generate a consolidated HTML report. The main use case is "show my manager every project I've touched in the last year" or "give a platform team one dashboard across N services" without scripting the multi-extract + multi-input pipeline manually. 
+
+```bash
+# Discover and extract every repo under ~/work, one JSONL per repo
+gitcortex scan --root ~/work --output ./scan-out
+
+# Consolidated report across all discovered repos
+gitcortex scan --root ~/work --output ./scan-out --report ./all.html
+
+# Personal cross-repo profile: only MY commits, with per-repo breakdown
+gitcortex scan --root ~/work --output ./scan-out \
+ --report ./me.html --email me@company.com --since 1y \
+ --include-commit-messages
+
+# Multiple roots, higher parallelism, pre-set ignore patterns
+gitcortex scan --root ~/work --root ~/personal --root ~/oss \
+ --parallel 8 --max-depth 4 \
+ --output ./scan-out --report ./all.html
+```
+
+The scan output directory holds:
+
+| file | purpose |
+|---|---|
+| `<slug>.jsonl` | per-repo JSONL, one per discovered repo |
+| `<slug>.state` | resume checkpoint (safe to re-run scan to continue) |
+| `manifest.json` | discovery results, per-repo status (ok/failed/pending), timing |
+
+Each repo's slug is derived from its directory basename; colliding basenames get a short SHA-1 suffix (the suffix lengthens automatically on the rare truncation collision, so `<slug>.state` is stable across runs).
+
+**Filtering discovery with `.gitcortex-ignore`.** Create a gitignore-style file at the scan root:
+
+```
+# skip heavy clones we don't want in the report
+node_modules
+chromium.git
+linux.git
+
+# skip vendored repos except the one we own
+vendor/
+!vendor/in-house-fork
+```
+
+Directory rules, globs, `**/foo`, and `!path` negations all work. Globbed negations like `!vendor*/keep` are honored — discovery descends into any dir where a negation rule could match a descendant. If `--ignore-file` is not set, scan looks for `.gitcortex-ignore` in the first `--root`.
+
+**Consolidated report extras.** When a scan produces more than one repo's data, both the team report and `--email` profile report render a new *Per-Repository Breakdown* section: commits, churn, files, active days, and the share-of-total for each repo. 
On an `--email` profile the counts are filtered to that developer's contributions (files count reflects only files the dev touched).
+
+**Flags worth knowing:**
+
+- `--parallel N` — repos extracted concurrently (default 4). Git is I/O-bound, so values past NumCPU give diminishing returns.
+- `--max-depth N` — stop descending past N levels. Useful when a root contains a monorepo with deeply nested internal repos you don't want enumerated.
+- `--extract-ignore <glob>` (repeatable) — forwarded to each per-repo `extract --ignore`, e.g. `--extract-ignore 'package-lock.json' --extract-ignore 'dist/*'`.
+- `--from / --to / --since` — time window applied to the consolidated report (same semantics as `report`).
+- `--churn-half-life`, `--coupling-max-files`, `--coupling-min-changes`, `--network-min-files` — pass tuning to the consolidated report identical to `gitcortex report`.
+
+Partial failures are non-fatal: the manifest records which repos failed, and the report is built from whichever JSONLs completed. `Ctrl+C` aborts both the discovery walk and any in-flight extracts; re-running picks up from each repo's state file.
+
 ### Diff: compare time periods
 
 Compare stats between two time periods, or filter to a single period.
@@ -444,6 +505,8 @@ gitcortex report --input data.jsonl --email alice@company.com --output alice.htm
 
 Includes: summary cards, activity heatmap (with table toggle), top contributors, file hotspots, churn risk (with full-dataset label distribution strip above the truncated table), bus factor, file coupling, working patterns heatmap, top commits, developer network, and developer profiles. A collapsible glossary at the top defines the terms (bus factor, churn, legacy-hotspot, specialization, etc.) for readers who are not already familiar. Typical size: 50-500KB depending on number of contributors. 
+When the input is multi-repo (from `gitcortex scan` or multiple `--input` files), both the team report and `--email` profile also render a *Per-Repository Breakdown* with commit/churn/files/active-days per repo and each repo's share of the total. + > The HTML activity heatmap is always monthly (year × 12 months grid). For day/week/year buckets, use `gitcortex stats --stat activity --granularity `. ### CI: quality gates for pipelines @@ -483,9 +546,14 @@ internal/ parse.go Shared types (RawEntry, NumstatEntry) discard.go Malformed entry tracking extract/extract.go Extraction orchestration, state, JSONL writing + scan/ + scan.go Multi-repo orchestration (worker pool over extract) + discovery.go Directory walk, bare-repo detection, slug uniqueness + ignore.go Gitignore-style matcher with negation support stats/ - reader.go Streaming JSONL aggregator (single-pass) + reader.go Streaming JSONL aggregator (single-pass, multi-JSONL) stats.go Stat computations (9 stats) + repo_breakdown.go Per-repository aggregate (scan consolidated report) format.go Table/CSV/JSON output formatting ``` diff --git a/docs/METRICS.md b/docs/METRICS.md index 7ed473e..6749855 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -294,6 +294,27 @@ A `tree(1)`-style view of the repository's directory layout, built from paths se **When to use**: before drilling into hotspots or churn-risk, skim the structure to locate the modules those files live in. The tree is navigational context; ranked tables are where judgment happens. +## Per-Repository Breakdown + +Cross-repo aggregation that appears only when the dataset was loaded from more than one JSONL (typically via `gitcortex scan` or multiple `--input` files). 
One row per repo:
+
+| column | meaning |
+|---|---|
+| `Commits` | author-dated commit count in this repo |
+| `% Commits` | share of total commits in the dataset |
+| `Churn` | additions + deletions attributed to this repo |
+| `% Churn` | share of total churn |
+| `Files` | distinct files touched in this repo; when filtered by `--email`, counts only files that developer touched |
+| `Active days` | distinct UTC author-dates |
+| `Devs` | unique author emails in this repo |
+| `First → Last` | earliest and latest author-date |
+
+**How the repo label is derived**: `LoadMultiJSONL` prefixes every path in the dataset with `<slug>:` (so `WordPress.git.jsonl` contributes paths like `WordPress.git:wp-includes/foo.php`). The breakdown groups by that prefix. If only a single JSONL is loaded, no prefix is emitted and the breakdown collapses to a single `(repo)` row the HTML report hides.
+
+**Divergence between `% Commits` and `% Churn` is informative**: a repo dominating churn while holding modest commit share often signals large-content work (docs, data), while the reverse points to small-diff high-frequency repos (config, manifests).
+
+**SHA collisions across repos** (forks, mirrors, cherry-picks between sibling projects) are preserved here — the breakdown tracks commits per repo via a dedicated slice populated at ingest, not the SHA-keyed commit map. Other file-level metrics (bus factor, coupling, dev network) still key by SHA and will collapse collided commits onto one record; if exact attribution matters for those, scan and aggregate the sibling repos separately.
+
 ## Data Flow
 
 ```
@@ -364,6 +385,8 @@ Directory-segment heuristics (`vendor`, `node_modules`, `dist`, `build`, `third_
 
 The warning is advisory. Nothing is auto-filtered; the user decides whether to re-extract. Matches do not affect computed stats in that run. JSON/CSV output paths skip the warning since they're typically piped.
 
+On multi-JSONL loads (e.g. 
`stats --input a.jsonl --input b.jsonl` or a `gitcortex scan` dataset), paths carry a `:` prefix internally. The suspect detector strips that prefix before matching and suggesting, so root-level `vendor/`, `package-lock.json`, `go.sum`, etc. in any individual repo are detected and the emitted `--ignore` globs are repo-relative (drop-in for `extract --ignore` and `scan --extract-ignore`). Same-shape findings across repos collapse to one suggestion (`dist/*` applies everywhere rather than being listed once per repo). + Statistical heuristics (very high churn-per-commit, single-author bulk updates) are deliberately out of scope — their false-positive rate on hand-authored code is higher than the path-based list and we'd rather stay quiet than cry wolf. ### `--mailmap` off by default From 00b7321ffe6adebb3d758082ea4069a79b18ce75 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 14:00:43 -0300 Subject: [PATCH 14/37] Validate --since before launching scan scanCmd parsed --since only after scan.Run had already walked every root and extracted every discovered repo. An invalid value (`--since 1yy`, `--since nope`) cost minutes-to-hours of discovery and extraction on large workspaces before the CLI finally rejected the flag. Move parseSince up alongside the other flag validation so obvious typos fail instantly without doing any filesystem work. Test runs scanCmd against an empty TempDir with --since bogus and asserts two things: the error mentions --since (early validation fired), and it does NOT contain "no git repositories found" (the error scan.Run would have produced had it been reached). Together these prove scan.Run was not called. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 22 ++++++++++++--------- cmd/gitcortex/main_test.go | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index ddcf0eb..0886401 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -921,6 +921,19 @@ breakdown — handy for showing aggregated work across many repos.`, if from != "" && to != "" && from > to { return fmt.Errorf("--from (%s) must be on or before --to (%s)", from, to) } + // Resolve --since up-front. If this fails we'd otherwise + // discover the typo only after a full multi-repo scan — + // minutes to hours on a large workspace, all thrown away + // because the user mistyped `--since 1yy`. Validate early, + // fail fast, keep the result for the report stage. + fromDate := from + if since != "" { + d, err := parseSince(since) + if err != nil { + return err + } + fromDate = d + } cfg := scan.Config{ Roots: roots, @@ -958,15 +971,6 @@ breakdown — handy for showing aggregated work across many repos.`, return fmt.Errorf("no successful repos extracted; cannot build report") } - fromDate := from - if since != "" { - d, err := parseSince(since) - if err != nil { - return err - } - fromDate = d - } - ds, err := stats.LoadMultiJSONL(result.JSONLs, stats.LoadOptions{ From: fromDate, To: to, diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index 57410bf..bf7edd3 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -1,10 +1,50 @@ package main import ( + "bytes" "strings" "testing" ) +// scanCmd must validate --since BEFORE running the discovery walk +// and extract pool. Without the early check, an obvious typo like +// `--since 1yy` only surfaces after scan.Run has already walked +// every root and extracted every repo found — which can take +// minutes-to-hours on a large workspace and waste the work. 
+// +// Use an empty TempDir so scan.Run would fail fast with "no git +// repositories found" if we reached it — that error does not mention +// "since", so an error containing "since" proves the early +// validation fired first. +func TestScanCmd_ValidatesSinceBeforeScanning(t *testing.T) { + cmd := scanCmd() + cmd.SilenceUsage = true + cmd.SilenceErrors = true + cmd.SetArgs([]string{ + "--root", t.TempDir(), + "--output", t.TempDir(), + "--since", "bogus", + }) + // Swallow any stderr output cobra might emit so we don't pollute + // go-test logs on success. + cmd.SetOut(&bytes.Buffer{}) + cmd.SetErr(&bytes.Buffer{}) + + err := cmd.Execute() + if err == nil { + t.Fatal("expected error for --since bogus, got nil") + } + if !strings.Contains(err.Error(), "since") { + t.Errorf("expected error to mention --since (proving early validation), got %q", err) + } + // Reaching scan.Run on an empty TempDir would produce the + // discovery error; asserting its absence confirms the walk was + // not started. + if strings.Contains(err.Error(), "no git repositories found") { + t.Errorf("scan.Run was reached before --since was validated — discovery ran on an invalid-flag input: %q", err) + } +} + func TestValidateDate(t *testing.T) { cases := []struct { in string From 814269588033a7795220ad49d8d336c51dd07d11 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 14:02:13 -0300 Subject: [PATCH 15/37] Resolve symlink roots before walking discovery tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit filepath.WalkDir has a well-known quirk: when the ROOT argument is a symlink it visits only the link entry (reported as ModeSymlink) and refuses to descend into the target. Users whose `~/work` is a symlink to `/mnt/data/work` — a common setup on machines with a separate data disk — would see discovery return zero repos and scan conclude "no git repositories found" despite the target being full of them. 
os.Stat already followed the link for the is-directory check, so the error path silently branched at WalkDir instead of surfacing a clear problem. Canonicalize the root with filepath.EvalSymlinks before stat'ing; the walk then starts at a real directory path. Symlinks ENCOUNTERED during the walk (as opposed to the root itself) are left as-is — we don't chase every link on disk, only the one the user explicitly named as a root. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 19 +++++++++++++++++ internal/scan/discovery_test.go | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 2f2716a..3a76b8c 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -55,6 +55,25 @@ func Discover(ctx context.Context, roots []string, matcher *Matcher, maxDepth in if err != nil { return nil, fmt.Errorf("resolve %s: %w", root, err) } + // Canonicalize via EvalSymlinks. os.Stat above followed the + // symlink for the is-directory check, but filepath.WalkDir + // treats a symlink ROOT specially: it visits only the link + // itself (reported with ModeSymlink) and does NOT descend + // into the target. For a user whose ~/work is a symlink to + // /mnt/data/work — a common setup — the walk would yield + // zero callbacks past the root and scan would conclude "no + // git repositories found under ..." despite the target being + // full of repos. EvalSymlinks dereferences the root once so + // WalkDir starts with a real directory path; links ENCOUNTERED + // during the walk are still left as-is (default WalkDir + // behavior), which is what we want — we don't chase every + // symlink we come across, only the ones the user explicitly + // named as a root. 
+ resolved, err := filepath.EvalSymlinks(abs) + if err != nil { + return nil, fmt.Errorf("resolve symlinks for %s: %w", abs, err) + } + abs = resolved info, err := os.Stat(abs) if err != nil { return nil, fmt.Errorf("stat %s: %w", abs, err) diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index f07214d..8dcaadc 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -198,6 +198,42 @@ func TestAssignSlugs_DeterministicUnderRetry(t *testing.T) { // - elapsed time < full baseline (walk was actually interrupted) // - at least one repo was discovered BEFORE cancel (not a // pre-cancel scenario in disguise) +// Regression: WalkDir given a symlink ROOT visits only the link +// entry and refuses to descend. Users whose ~/work is a symlink to +// a real data disk would previously see "no git repositories found" +// despite the target being full of repos. Canonicalize via +// EvalSymlinks before walking so the walk starts at a real dir. +func TestDiscover_DereferencesSymlinkRoot(t *testing.T) { + real := t.TempDir() + mustMkRepo(t, filepath.Join(real, "repo-a")) + mustMkRepo(t, filepath.Join(real, "nested", "repo-b")) + + // Place the symlink in a separate TempDir so the test doesn't + // have to clean it up explicitly. 
+ linkDir := t.TempDir() + link := filepath.Join(linkDir, "work") + if err := os.Symlink(real, link); err != nil { + t.Skipf("symlinks not supported here: %v", err) + } + + repos, err := Discover(context.Background(), []string{link}, NewMatcher(nil), 0) + if err != nil { + t.Fatalf("Discover via symlink root: %v", err) + } + if len(repos) != 2 { + t.Fatalf("want 2 repos found through symlink root, got %d: %+v", len(repos), repos) + } + got := map[string]bool{} + for _, r := range repos { + got[r.Slug] = true + } + for _, want := range []string{"repo-a", "repo-b"} { + if !got[want] { + t.Errorf("expected slug %q in %v (symlink root was not dereferenced)", want, got) + } + } +} + func TestDiscover_AbortsMidWalk(t *testing.T) { if testing.Short() { t.Skip("builds a large synthetic tree; skipped in -short mode") From 5bd94ab0d9cdf5d645cc86bd7d807a9d38a1440e Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 17:12:01 -0300 Subject: [PATCH 16/37] Normalize slug uniqueness for case-insensitive filesystems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On macOS (APFS/HFS+ default) and Windows (NTFS default), Repo.jsonl and repo.jsonl resolve to the same file on disk. The case-sensitive counts[base] / seen[slug] checks treated two repos named Repo and repo as distinct, handed each the bare basename, and let the filesystem silently merge their JSONL and state files — one scan's output overwriting the other with no warning. Fold to lower case for both the duplicate count and the seen-set so case-only basename differences now trigger the hash-suffix branch. Emitted slugs keep their original case for readability; the collision check that decides filesystem safety runs on the lower-cased form, which is what the OS compares against. 
Test pairs /a/Team/Repo with /b/OSS/repo and asserts the slugs differ even under strings.EqualFold and both carry a hash suffix — without the fix, both would land on bare "Repo"/"repo" and collide on disk. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 21 +++++++++++++++++---- internal/scan/discovery_test.go | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 3a76b8c..2989893 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -208,6 +208,18 @@ const initialSlugHashLen = 6 // redo the pass with a longer hash. Grows up to the full 40 hex chars // before panicking — needing that much is a cryptographic event, not // a scan-time bug. +// +// Case-insensitive uniqueness: on macOS (APFS/HFS+ in the default +// configuration) and Windows (NTFS), `Repo.jsonl` and `repo.jsonl` +// resolve to the same file on disk. A case-sensitive slug compare +// would treat the two repos as distinct, hand each the bare basename, +// and then quietly let the filesystem merge their JSONL and state +// files. Fold to lower case for BOTH the duplicate-detection count +// and the uniqueness seen-set so collision-triggered hashing fires +// on case-only differences. The emitted slug retains the original +// case for readability (so paths named Repo and repo produce +// `Repo-` and `repo-` — visually distinct to humans, +// case-insensitively distinct on disk). 
func assignSlugs(repos []Repo) { counts := make(map[string]int) bases := make([]string, len(repos)) @@ -217,7 +229,7 @@ func assignSlugs(repos []Repo) { base = "repo" } bases[i] = base - counts[base]++ + counts[strings.ToLower(base)]++ } for hashLen := initialSlugHashLen; hashLen <= 40; hashLen += 2 { @@ -227,15 +239,16 @@ func assignSlugs(repos []Repo) { for i := range repos { base := bases[i] slug := base - if counts[base] > 1 { + if counts[strings.ToLower(base)] > 1 { h := sha1.Sum([]byte(repos[i].AbsPath)) slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:hashLen]) } - if prev, ok := seen[slug]; ok && prev != i { + key := strings.ToLower(slug) + if prev, ok := seen[key]; ok && prev != i { collided = true break } - seen[slug] = i + seen[key] = i proposed[i] = slug } if !collided { diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 8dcaadc..5c1fa19 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "testing" "time" ) @@ -162,6 +163,38 @@ func findColliding6HexPaths(maxAttempts int) (string, string, bool) { return "", "", false } +// Regression: on macOS (APFS/HFS+ default) and Windows (NTFS), +// `Repo.jsonl` and `repo.jsonl` are the same file on disk. A +// case-sensitive slug compare would hand both repos the bare +// basename and let the filesystem merge their outputs — one scan's +// JSONL silently overwrites the other. Counts and seen-set are +// lower-cased so any case-only difference is treated as a collision +// and forces the hash suffix. 
+func TestAssignSlugs_CaseInsensitiveUniqueness(t *testing.T) { + repos := []Repo{ + {AbsPath: "/a/Team/Repo"}, + {AbsPath: "/b/OSS/repo"}, + } + assignSlugs(repos) + + if repos[0].Slug == repos[1].Slug { + t.Fatalf("repos with case-only-differing basenames got same slug %q", repos[0].Slug) + } + // The fs collision only goes away if the slugs also differ when + // folded to lower case — which is what decides filenames on + // case-insensitive fs. + if strings.EqualFold(repos[0].Slug, repos[1].Slug) { + t.Fatalf("slugs %q and %q differ only in case — they would still collide on macOS/Windows", repos[0].Slug, repos[1].Slug) + } + // Sanity: both should have gained a hash suffix (the bare `Repo` + // and `repo` weren't safe to emit). + for _, r := range repos { + if !strings.Contains(r.Slug, "-") { + t.Errorf("repo %s got bare slug %q; case-sensitive branch did not trigger hashing", r.AbsPath, r.Slug) + } + } +} + // Same paths ingested twice must produce the same slugs — this is // what makes resume work across runs. Even with the retry loop // adjusting hash length under collisions, a deterministic input set From 89df5862addb4ef2cf8c7e88f72859a827be152f Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 17:22:02 -0300 Subject: [PATCH 17/37] Fail scan when every repo's extract failed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scanCmd's no-report branch returned success unconditionally, so a run where every discovered repo's extract failed still exited 0 after printing "Scan complete: 0 JSONL file(s)". CI and any other automation that checked only the exit code treated a fully-failed scan as successful and moved on to downstream steps with empty artifacts. The report branch already had the guard; apply it uniformly before the branch so both paths behave. scan.Run is intentionally non-fatal on per-repo errors so one transient failure doesn't tank a batch — that policy remains. 
But when the successful count is zero, there's nothing useful on disk to inspect and the CLI must surface the problem. The error message points at the manifest, which is where per-repo Status fields record which repos failed and why. Test creates a fake repo (empty .git dir that discovery picks up but extract rejects as "not a git repository") and asserts scanCmd's error surfaces. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 17 ++++++++++++++--- cmd/gitcortex/main_test.go | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 0886401..948aac4 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -963,13 +963,24 @@ breakdown — handy for showing aggregated work across many repos.`, return err } + // Exit non-zero when every repo failed to extract, regardless + // of whether a report was requested. scan.Run intentionally + // treats per-repo failures as non-fatal so a transient error + // on one repo doesn't tank the whole batch — but when the + // count of successes is zero, there's nothing useful on disk + // to inspect. Without this guard CI sees exit code 0 and + // continues with empty artifacts; the manifest's per-repo + // Status fields remain the only way to see the failures. 
+ if len(result.JSONLs) == 0 { + return fmt.Errorf("scan found %d repositor(ies) but all extracts failed; see %s for per-repo status", + len(result.Manifest.Repos), + filepath.Join(result.OutputDir, "manifest.json")) + } + if reportPath == "" { fmt.Fprintf(os.Stderr, "Scan complete: %d JSONL file(s) in %s\n", len(result.JSONLs), result.OutputDir) return nil } - if len(result.JSONLs) == 0 { - return fmt.Errorf("no successful repos extracted; cannot build report") - } ds, err := stats.LoadMultiJSONL(result.JSONLs, stats.LoadOptions{ From: fromDate, diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index bf7edd3..3c5fcc0 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -2,6 +2,8 @@ package main import ( "bytes" + "os" + "path/filepath" "strings" "testing" ) @@ -45,6 +47,42 @@ func TestScanCmd_ValidatesSinceBeforeScanning(t *testing.T) { } } +// scanCmd must exit non-zero when every discovered repo's extract +// failed, regardless of whether --report was requested. scan.Run +// intentionally treats per-repo failures as non-fatal (a transient +// error on one repo shouldn't tank the whole batch), but if ZERO +// repos succeed there's nothing useful on disk and CI should know. +// Previously the no-report branch returned success unconditionally; +// automation saw exit 0 and "Scan complete: 0 JSONL file(s)" and +// continued with empty artifacts. +// +// Test uses a fake repo (a bare `.git` dir with no history) so +// discovery picks it up but extract fails. 
+func TestScanCmd_ExitsNonZeroWhenAllExtractsFail(t *testing.T) { + root := t.TempDir() + if err := os.MkdirAll(filepath.Join(root, "fake-repo", ".git"), 0o755); err != nil { + t.Fatal(err) + } + + cmd := scanCmd() + cmd.SilenceUsage = true + cmd.SilenceErrors = true + cmd.SetOut(&bytes.Buffer{}) + cmd.SetErr(&bytes.Buffer{}) + cmd.SetArgs([]string{ + "--root", root, + "--output", t.TempDir(), + }) + + err := cmd.Execute() + if err == nil { + t.Fatal("expected non-zero exit when all extracts fail; CI would silently treat this as success") + } + if !strings.Contains(err.Error(), "all extracts failed") { + t.Errorf("error should explain every repo failed; got %q", err) + } +} + func TestValidateDate(t *testing.T) { cases := []struct { in string From 66f45169bd5b2d4c98d56cf613142adc98f407cc Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 17:24:36 -0300 Subject: [PATCH 18/37] Reject missing explicit ignore file paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LoadMatcher treated os.IsNotExist as success and returned an empty matcher. A typo in `--ignore-file my.ignorre` silently disabled every rule and widened discovery to include node_modules/, vendor/, and any other trees the user thought they had excluded — with no signal from the console that anything was wrong. Users would see a consolidated report loaded with unwanted content and have no breadcrumb pointing at the mistyped flag. Fail loudly in LoadMatcher. The implicit default-path lookup (scan.loadMatcher checking for .gitcortex-ignore at the first root) already os.Stats the candidate before calling LoadMatcher, so the "silent when absent" semantics survive exactly where they belong: not on explicit user input. Tests pair a unit-level LoadMatcher check with a CLI-layer `--ignore-file /typo.ignore` check; the latter verifies the error surfaces before discovery starts (no "no git repositories found" wrapper from a downstream phase). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main_test.go | 33 +++++++++++++++++++++++++++++++++ internal/scan/ignore.go | 13 +++++++------ internal/scan/ignore_test.go | 24 +++++++++++++++--------- 3 files changed, 55 insertions(+), 15 deletions(-) diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index 3c5fcc0..aca5896 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -83,6 +83,39 @@ func TestScanCmd_ExitsNonZeroWhenAllExtractsFail(t *testing.T) { } } +// `scan --ignore-file /nonexistent` must fail at the CLI layer — +// silently falling back to zero rules would widen discovery scope +// without telling the user (e.g. node_modules and vendor/ dirs the +// user thought they excluded suddenly appear in the report). The +// default-path lookup in scan.loadMatcher still tolerates a missing +// `.gitcortex-ignore` at the first root; only the explicit flag is +// strict. +func TestScanCmd_RejectsMissingIgnoreFile(t *testing.T) { + cmd := scanCmd() + cmd.SilenceUsage = true + cmd.SilenceErrors = true + cmd.SetOut(&bytes.Buffer{}) + cmd.SetErr(&bytes.Buffer{}) + cmd.SetArgs([]string{ + "--root", t.TempDir(), + "--output", t.TempDir(), + "--ignore-file", filepath.Join(t.TempDir(), "typo.ignore"), + }) + + err := cmd.Execute() + if err == nil { + t.Fatal("expected error for missing --ignore-file; a typo must not silently widen scope") + } + if !strings.Contains(err.Error(), "typo.ignore") { + t.Errorf("error should identify the missing file; got %q", err) + } + // Must fail BEFORE discovery starts — scan on an empty TempDir + // would otherwise produce "no git repositories found". 
+ if strings.Contains(err.Error(), "no git repositories found") { + t.Errorf("ignore-file check ran after discovery; got %q", err) + } +} + func TestValidateDate(t *testing.T) { cases := []struct { in string diff --git a/internal/scan/ignore.go b/internal/scan/ignore.go index 3101124..bffce2c 100644 --- a/internal/scan/ignore.go +++ b/internal/scan/ignore.go @@ -41,15 +41,16 @@ func NewMatcher(patterns []string) *Matcher { return m } -// LoadMatcher reads an ignore file and returns a matcher. A missing file -// is not an error — scan simply proceeds with no user-provided rules. -// This matches the gitignore contract where `.gitignore` is optional. +// LoadMatcher reads an ignore file and returns a matcher. Missing files +// are rejected — this entry point is used when the path was specified +// explicitly (e.g. `--ignore-file foo.ignore`), where a typo should +// fail loudly instead of silently disabling all ignore rules and +// widening discovery scope. Callers that want "load if present, empty +// matcher otherwise" semantics (implicit default-path lookup) should +// os.Stat first. 
func LoadMatcher(file string) (*Matcher, error) { f, err := os.Open(file) if err != nil { - if os.IsNotExist(err) { - return NewMatcher(nil), nil - } return nil, fmt.Errorf("open %s: %w", file, err) } defer f.Close() diff --git a/internal/scan/ignore_test.go b/internal/scan/ignore_test.go index ed9510f..cda4f25 100644 --- a/internal/scan/ignore_test.go +++ b/internal/scan/ignore_test.go @@ -3,6 +3,7 @@ package scan import ( "os" "path/filepath" + "strings" "testing" ) @@ -45,16 +46,21 @@ func TestMatcher_Basics(t *testing.T) { } } -func TestLoadMatcher_MissingFileIsOK(t *testing.T) { - m, err := LoadMatcher(filepath.Join(t.TempDir(), "missing")) - if err != nil { - t.Fatalf("expected no error for missing file, got %v", err) - } - if m == nil { - t.Fatal("matcher should not be nil") +// LoadMatcher is called when the user supplied an explicit +// --ignore-file; a missing file there is almost always a typo. If +// we silently returned an empty matcher, every discovery would +// widen to include node_modules/, vendor/, chromium-clones/, etc. +// the user thought they had excluded — and they'd have no way to +// tell from the console output. Fail loudly instead; the +// default-path lookup in scan.loadMatcher handles the "silent when +// absent" case via its own os.Stat before calling here. 
+func TestLoadMatcher_MissingFileFails(t *testing.T) { + _, err := LoadMatcher(filepath.Join(t.TempDir(), "typo.ignore")) + if err == nil { + t.Fatal("expected error for missing explicit ignore file; a typo must not silently disable rules") } - if m.Match("anything", false) { - t.Error("empty matcher should match nothing") + if !os.IsNotExist(err) && !strings.Contains(err.Error(), "typo.ignore") { + t.Errorf("error should identify the missing path; got %q", err) } } From f62cdc5a85d3280b6579c006e33e42d75042e90d Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 17:43:13 -0300 Subject: [PATCH 19/37] Recover from worker panics in the scan pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If extract.Run panicked inside a worker goroutine (unexpected git binary output, nil deref from a future refactor, etc.) the goroutine would crash the entire scan process before its manifest.Repos[idx] slot was written. At best the repo stayed forever at Status="pending" — indistinguishable from "worker never dispatched this repo", which misled both operators and downstream automation. At worst, the process died mid-batch and the other in-flight extracts were lost. Wrap the per-job call in runJobSafely: defer-recover converts any panic into a ManifestRepo with Status="failed" and an Error prefixed with "panic:". The sibling workers keep pulling from the jobs channel, so the rest of the batch completes. extract.Run is not expected to panic under normal inputs — this is a defensive seam for the "impossible" cases the branch review flagged as the only remaining untested failure mode. Package-level runJobFn indirection lets the test override the worker body to force a deterministic panic without needing a broken git binary or a real extract bug to reproduce against. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/scan.go | 37 +++++++++++++++++-- internal/scan/scan_test.go | 73 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 2 deletions(-) diff --git a/internal/scan/scan.go b/internal/scan/scan.go index 4417ad6..a634f21 100644 --- a/internal/scan/scan.go +++ b/internal/scan/scan.go @@ -126,8 +126,7 @@ func Run(ctx context.Context, cfg Config) (*Result, error) { go func() { defer wg.Done() for j := range jobs { - entry := runOne(ctx, cfg, j.repo) - manifest.Repos[j.idx] = entry + manifest.Repos[j.idx] = runJobSafely(ctx, cfg, j.repo) } }() } @@ -197,6 +196,40 @@ func loadMatcher(cfg Config) (*Matcher, error) { return LoadMatcher(path) } +// runJobFn is the injection point the worker pool uses. Default is +// the real runOne; tests override it to simulate extract panics and +// verify the pool's recovery path. Not exposed via any user-facing +// API — this is strictly a test seam. +var runJobFn = runOne + +// runJobSafely wraps runJobFn in a defer-recover. If extract panics +// inside a worker the goroutine would otherwise crash the entire +// process and the repo's manifest slot would stay at "pending", which +// is indistinguishable from a job that was never dispatched. Convert +// panics to a well-formed ManifestRepo with Status="failed" so the +// manifest reflects reality and the rest of the pool keeps running. +// +// extract.Run doesn't panic under normal inputs — the recover exists +// for the "impossible" cases (e.g. a git binary that writes malformed +// output, nil-deref bugs introduced by a future refactor). Better to +// isolate the blast radius than leave a silent corruption mode. 
+func runJobSafely(ctx context.Context, cfg Config, repo Repo) (entry ManifestRepo) { + defer func() { + if r := recover(); r != nil { + entry = ManifestRepo{ + Slug: repo.Slug, + Path: repo.AbsPath, + JSONL: filepath.Join(cfg.Output, repo.Slug+".jsonl"), + StateFile: filepath.Join(cfg.Output, repo.Slug+".state"), + Status: "failed", + Error: fmt.Sprintf("panic: %v", r), + } + log.Printf("scan: [%s] panicked: %v", repo.Slug, r) + } + }() + return runJobFn(ctx, cfg, repo) +} + func runOne(ctx context.Context, cfg Config, repo Repo) ManifestRepo { jsonlPath := filepath.Join(cfg.Output, repo.Slug+".jsonl") statePath := filepath.Join(cfg.Output, repo.Slug+".state") diff --git a/internal/scan/scan_test.go b/internal/scan/scan_test.go index db24410..ad0c073 100644 --- a/internal/scan/scan_test.go +++ b/internal/scan/scan_test.go @@ -13,6 +13,79 @@ import ( "github.com/lex0c/gitcortex/internal/stats" ) +// Regression: if extract.Run panics inside a worker (a future +// refactor with a nil deref, an unexpected git binary output, etc.), +// defer wg.Done() still closes the WaitGroup — but without a +// recover, the goroutine crashes the whole process before the +// manifest slot for that repo is written. Even without a full crash, +// the slot would silently stay at "pending" indistinguishable from +// "worker never reached it". The pool must convert panics to a +// ManifestRepo with Status="failed" and continue with sibling jobs. +// +// Override the injection seam runJobFn: first repo panics, second +// runs normally. Assert (1) the pool survives, (2) the panicking +// repo's slot is Status=failed with a panic: prefix, (3) the +// non-panicking repo still completes. 
+func TestRun_RecoversFromWorkerPanic(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not installed") + } + + root := t.TempDir() + makeRealRepo(t, filepath.Join(root, "boom"), map[string]string{"x.go": "x\n"}) + makeRealRepo(t, filepath.Join(root, "ok"), map[string]string{"y.go": "y\n"}) + + // Override runJobFn for the duration of this test only. + original := runJobFn + defer func() { runJobFn = original }() + runJobFn = func(ctx context.Context, cfg Config, repo Repo) ManifestRepo { + if repo.Slug == "boom" { + panic("simulated extract failure") + } + return original(ctx, cfg, repo) + } + + output := filepath.Join(t.TempDir(), "out") + cfg := Config{ + Roots: []string{root}, + Output: output, + Parallel: 1, // force serial so the panicking job is reached + Extract: extract.Config{ + BatchSize: 100, + CommandTimeout: extract.DefaultCommandTimeout, + }, + } + + res, err := Run(context.Background(), cfg) + if err != nil { + t.Fatalf("Run should not propagate the worker panic as an error; got %v", err) + } + + gotStatus := map[string]ManifestRepo{} + for _, r := range res.Manifest.Repos { + gotStatus[r.Slug] = r + } + boom, ok := gotStatus["boom"] + if !ok { + t.Fatal("boom slot missing from manifest") + } + if boom.Status != "failed" { + t.Errorf("boom should be failed after panic, got Status=%q", boom.Status) + } + if !strings.Contains(boom.Error, "panic") { + t.Errorf("boom error should carry a panic: prefix so operators can tell panics from normal failures; got %q", boom.Error) + } + if okRepo, okFound := gotStatus["ok"]; !okFound || okRepo.Status != "ok" { + t.Errorf("sibling job must still complete despite the other worker's panic; got %+v", okRepo) + } + // Ensure the pool didn't leak: all slots have a non-pending status. 
+ for _, r := range res.Manifest.Repos { + if r.Status == "pending" { + t.Errorf("repo %s left at 'pending' after Run returned — the pool abandoned a slot", r.Slug) + } + } +} + func TestRun_EndToEnd(t *testing.T) { if _, err := exec.LookPath("git"); err != nil { t.Skip("git not installed") From ecce5ee12fbf1e73a94501027e2f57a923d6e953 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 17:51:24 -0300 Subject: [PATCH 20/37] Scope Per-Repository Breakdown to profile reports only The breakdown was rendered on both the team report and the --email profile report whenever the dataset spanned multiple repos. On reflection the team view adds little: "how is git history distributed across the repos in this scan" is nearly tautological and usually better answered by manifest.json or running stats per-repo. The section earns its place only when filtered by developer, where the same numbers turn into a real consolidation story: "where did this person spend their time across projects". Remove the section from the team template; keep it on the profile template where it was already filtered by email. Drop the unused ReportData.Repos field and the stats.RepoBreakdown call from Generate; ProfileReportData.Repos remains the single populated home for the metric. Team-report regression test now asserts the section does NOT render even on a multi-repo dataset; the profile test (unchanged) continues to cover the positive case with email filtering. README and METRICS.md rewritten to frame the metric as profile-only. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 4 +-- docs/METRICS.md | 6 ++-- internal/report/report.go | 7 ----- internal/report/report_test.go | 55 ++++++++++------------------------ internal/report/template.go | 38 ----------------------- 5 files changed, 21 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index bdcf222..df25f09 100644 --- a/README.md +++ b/README.md @@ -450,7 +450,7 @@ vendor/ Directory rules, globs, `**/foo`, and `!path` negations all work. Globbed negations like `!vendor*/keep` are honored — discovery descends into any dir where a negation rule could match a descendant. If `--ignore-file` is not set, scan looks for `.gitcortex-ignore` in the first `--root`. -**Consolidated report extras.** When a scan produces more than one repo's data, both the team report and `--email` profile report render a new *Per-Repository Breakdown* section: commits, churn, files, active days, and the share-of-total for each repo. On an `--email` profile the counts are filtered to that developer's contributions (files count reflects only files the dev touched). +**Consolidated profile report.** When `scan --email me@company.com --report path.html` runs against a multi-repo dataset, the profile report renders a *Per-Repository Breakdown* section: commits, churn, files, active days, and share-of-total — all filtered to that developer's contributions (files count reflects only files the dev touched). The section is profile-only; the team report focuses on the aggregate across repos without a per-repo split (run `stats --input repo.jsonl` per-repo or read the `manifest.json` for that). 
**Flags worth knowing:** @@ -505,7 +505,7 @@ gitcortex report --input data.jsonl --email alice@company.com --output alice.htm Includes: summary cards, activity heatmap (with table toggle), top contributors, file hotspots, churn risk (with full-dataset label distribution strip above the truncated table), bus factor, file coupling, working patterns heatmap, top commits, developer network, and developer profiles. A collapsible glossary at the top defines the terms (bus factor, churn, legacy-hotspot, specialization, etc.) for readers who are not already familiar. Typical size: 50-500KB depending on number of contributors. -When the input is multi-repo (from `gitcortex scan` or multiple `--input` files), both the team report and `--email` profile also render a *Per-Repository Breakdown* with commit/churn/files/active-days per repo and each repo's share of the total. +When the input is multi-repo (from `gitcortex scan` or multiple `--input` files) AND `--email` is set, the profile report renders a *Per-Repository Breakdown* with commit/churn/files/active-days per repo, filtered to that developer's contributions. The team-view report intentionally omits this section — per-repo aggregates on a consolidated dataset reduce to raw git-history distribution, which is more usefully inspected via `manifest.json` or `stats --input X.jsonl` per repo. > The HTML activity heatmap is always monthly (year × 12 months grid). For day/week/year buckets, use `gitcortex stats --stat activity --granularity `. diff --git a/docs/METRICS.md b/docs/METRICS.md index 6749855..72b9fff 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -296,7 +296,9 @@ A `tree(1)`-style view of the repository's directory layout, built from paths se ## Per-Repository Breakdown -Cross-repo aggregation that appears only when the dataset was loaded from more than one JSONL (typically via `gitcortex scan` or multiple `--input` files). One row per repo: +Cross-repo aggregation scoped to a single developer. 
Renders in the HTML profile report (`report --email me@x.com` or `scan --email me@x.com --report …`) when the dataset was loaded from more than one JSONL. The metric is deliberately profile-only: on a team-view consolidated report, per-repo aggregates reduce to raw git-history distribution and are better inspected via `manifest.json` or by running `stats --input X.jsonl` per-repo. Filtered by a developer's email, the same numbers answer a genuinely different question — *"where did this person spend their time?"* — which is what the section surfaces. + +One row per repo: | column | meaning | |---|---| @@ -304,7 +306,7 @@ Cross-repo aggregation that appears only when the dataset was loaded from more t | `% Commits` | share of total commits in the dataset | | `Churn` | additions + deletions attributed to this repo | | `% Churn` | share of total churn | -| `Files` | distinct files touched in this repo; when filtered by `--email`, counts only files that developer touched | +| `Files` | distinct files the developer touched in this repo (always email-filtered, since the section only renders on profile reports) | | `Active days` | distinct UTC author-dates | | `Devs` | unique author emails in this repo | | `First → Last` | earliest and latest author-date | diff --git a/internal/report/report.go b/internal/report/report.go index 358c894..24c982f 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -68,12 +68,6 @@ type ReportData struct { TotalDirectories int TotalExtensions int TotalBusFactorFiles int - - // Repos holds per-repository aggregates for multi-repo (scan) reports. - // Empty on single-repo runs — the template gates the section behind - // `{{if gt (len .Repos) 1}}` so single-repo callers keep their - // existing layout untouched. - Repos []stats.RepoStat } // htmlTreeDepth caps the repo-structure tree baked into the HTML report. 
@@ -382,7 +376,6 @@ func Generate(w io.Writer, ds *stats.Dataset, repoName string, topN int, sf stat TotalDirectories: stats.DirectoryCount(ds), TotalExtensions: stats.ExtensionCount(ds), TotalBusFactorFiles: stats.BusFactorCount(ds), - Repos: stats.RepoBreakdown(ds, ""), } CapChildrenPerDir(data.Structure, htmlTreeMaxChildrenPerDir) diff --git a/internal/report/report_test.go b/internal/report/report_test.go index f27cf33..45a91eb 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -78,18 +78,16 @@ func TestGenerate_SmokeRender(t *testing.T) { } } -// Regression: the Per-Repository Breakdown section emits a CSS width -// for each repo's commit-share bar. The width has to be a numeric -// literal — if the template feeds a string through printf "%.0f" -// (e.g. via the string-returning pctFloat helper) html/template's -// CSS sanitizer replaces it with `ZgotmplZ` and the bars render at -// zero width. Exercise the multi-repo path end-to-end and assert -// the rendered widths are clean. -func TestGenerate_PerRepoBreakdownWidthsAreNumeric(t *testing.T) { +// The Per-Repository Breakdown is a consolidation metric about the +// developer's work, not about the repository set. It belongs on the +// profile (--email) report where "3 of my commits in auth, 50 in +// payments" tells a story; on the team report it would just restate +// git-history distribution, which is tautological. Assert the team +// render does NOT surface the section even with a multi-repo dataset. +func TestGenerate_TeamReportOmitsPerRepoBreakdown(t *testing.T) { dir := t.TempDir() alpha := filepath.Join(dir, "alpha.jsonl") beta := filepath.Join(dir, "beta.jsonl") - // Distinct SHAs and content so both repos appear in the breakdown. 
alphaRow := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","author_email":"me@x.com","author_name":"Me","author_date":"2024-01-01T00:00:00Z","additions":10,"deletions":0,"files_changed":1} {"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"a.go","additions":10,"deletions":0} ` @@ -111,37 +109,14 @@ func TestGenerate_PerRepoBreakdownWidthsAreNumeric(t *testing.T) { t.Fatalf("Generate: %v", err) } out := buf.String() - if !strings.Contains(out, "Per-Repository Breakdown") { - t.Fatal("breakdown section missing from multi-repo report") - } - if strings.Contains(out, "ZgotmplZ") { - t.Error("report contains ZgotmplZ — template fed a non-numeric value to a CSS context (likely printf'd a string)") - } - if strings.Contains(out, "%!f(string=") { - t.Error("report contains %!f(string=...) — printf was given a string where a float was expected") - } - - // Stronger guard: the absence-of-markers check above would pass - // if a future bug rendered every bar at `width:0%`. The breakdown - // tile has a green fill (background:#216e39) — extract every - // rendered width for those tiles and assert at least one is - // non-zero. With alpha's 10-churn commit and beta's 25-churn - // commit producing distinct totals, the commit-share proportions - // can't all collapse to zero unless the template is broken. 
- widthRe := regexp.MustCompile(`width:([0-9]+(?:\.[0-9]+)?)%; background:#216e39`) - matches := widthRe.FindAllStringSubmatch(out, -1) - if len(matches) == 0 { - t.Fatal("no repo-breakdown bar widths found; template may have changed selector") - } - sawNonZero := false - for _, m := range matches { - if m[1] != "0" && m[1] != "0.0" { - sawNonZero = true - break - } - } - if !sawNonZero { - t.Errorf("every repo bar rendered with width:0 — proportions would be invisible in the UI; widths: %v", matches) + if strings.Contains(out, "Per-Repository Breakdown") { + t.Error("team report should not render the Per-Repository Breakdown; it's a profile-only metric") + } + // Sanity: the scan JSONL prefix still shows up in file paths etc. + // — we didn't accidentally strip the repo context from the whole + // report. + if !strings.Contains(out, "alpha:") && !strings.Contains(out, "beta:") { + t.Error("expected repo-prefixed paths to appear somewhere in the team report; none found") } } diff --git a/internal/report/template.go b/internal/report/template.go index 9e19846..d6a3a64 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -145,44 +145,6 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col -{{if gt (len .Repos) 1}} -

Per-Repository Breakdown {{thousands (len .Repos)}} repositories

-

Cross-repo aggregation from scan. Use this view to see how work is distributed: a single repo dominating commits or churn often means the consolidated activity story is really about that one project; an even spread shows broad multi-repo engagement. Bar widths are the share of total commits across all scanned repos.

-
Repository
-
+
{{printf "%.1f" .PctOfTotalCommits}}%
From 6cfe86d3f18350e8038daa8ca86e229b17391d4e Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:48:21 -0300 Subject: [PATCH 08/37] Skip ignored repos before recording discovery hits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discover descended into ignored directories only to give negation rules a chance to re-include descendants, but after the descent decision it still ran the .git detection on the ignored parent. When the parent itself held a .git (e.g. `vendor/` with vendor/.git plus a sibling !vendor/keep rule), the walker wrongly recorded the ignored parent as a repo and then returned filepath.SkipDir from the repo branch — which cut off the very descent the negation relied on. Both outcomes violated the documented last-match-wins semantics: the ignored dir leaked into scan output, and the re-included child never got a chance to register. Return early as soon as an ignored dir is walked-into-for-negation. The .git detection is now reached only for dirs the matcher did not ignore; re-included descendants are examined in their own callback invocation. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 15 +++++++++++---- internal/scan/discovery_test.go | 31 +++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index feae8b2..4211674 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -77,13 +77,20 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { // Don't SkipDir blindly — if any negation rule could // re-include a descendant (e.g. `vendor/` + `!vendor/keep`), // pruning here would drop the re-inclusion before its - // target is visited. Descend and let Match() decide - // per-path. For ordinary `node_modules`-style excludes - // with no negations in play, CouldReinclude returns - // false and pruning remains. 
+ // target is visited. if !matcher.CouldReinclude(rel) { return filepath.SkipDir } + // The dir itself is ignored; we only walk in so a + // negated descendant can be visited in its own turn. + // Return early so the .git detection below doesn't + // record this ignored directory as a repo (and so its + // `return filepath.SkipDir` doesn't cut the descent + // off before the negation's target is seen). A + // descendant whose rel path is re-included by `!rule` + // will be examined by the next WalkDir callback + // invocation. + return nil } // Two repo shapes to detect: diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index e5917ef..27f514f 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -80,6 +80,37 @@ func TestDiscover_HonorsNegatedDescendant(t *testing.T) { } } +// Regression: when the ignored directory itself is a repo and a +// negation rule points at a descendant, the walker must NOT record +// the ignored dir (it's ignored) AND must keep descending so the +// negation's target can be visited. Previously the .git detection +// ran unconditionally and both incorrectly recorded the ignored +// parent AND SkipDir'd the child out of the walk. +func TestDiscover_IgnoredRepoNotRecorded_DescendantStillFound(t *testing.T) { + root := t.TempDir() + // `vendor` is itself a repo AND is ignored by `vendor/`. + mustMkRepo(t, filepath.Join(root, "vendor")) + // `vendor/keep` is a nested repo, re-included by `!vendor/keep`. 
+ mustMkRepo(t, filepath.Join(root, "vendor", "keep")) + + matcher := NewMatcher([]string{"vendor/", "!vendor/keep"}) + repos, err := Discover([]string{root}, matcher, 0) + if err != nil { + t.Fatal(err) + } + + got := map[string]bool{} + for _, r := range repos { + got[r.RelPath] = true + } + if got["vendor"] { + t.Error("vendor itself is ignored and must not be recorded as a repo") + } + if !got["vendor/keep"] { + t.Errorf("vendor/keep should be re-included by the negation rule; got %+v", repos) + } +} + func TestMatcher_CouldReinclude(t *testing.T) { cases := []struct { name string From 26ac75e762855b189492eae83166b2fe78c69c44 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:51:00 -0300 Subject: [PATCH 09/37] Account for globbed negations in reinclude checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CouldReinclude only recognized negation rules whose pattern began with a literal `dir/` prefix. Globbed variants that can legitimately match descendants of an ignored directory — `!vendor*/keep`, `!*/keep`, `!*/vendor/keep` — returned false, so the walker pruned the parent and the re-included repos silently disappeared. Per-segment path.Match against the dir's segments handles both literal and glob first segments uniformly. A pattern is a candidate reinclude iff it has strictly more segments than dir and every leading segment matches the corresponding dir segment (path.Match treats `vendor*`, `*`, literals, etc. the same way). Basename-only and `**/`-prefixed negations keep their fire-anywhere semantics. Discovery test uses `vendor*/` + `!vendor*/keep` across vendor/ and vendor-old/ to prove both re-included repos are now kept while vendor/garbage stays out. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery_test.go | 38 ++++++++++++++++++++++++ internal/scan/ignore.go | 51 +++++++++++++++++++++++++++------ 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 27f514f..2a95aa2 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -123,6 +123,13 @@ func TestMatcher_CouldReinclude(t *testing.T) { {"unrelated negation", []string{"vendor/", "!src/main"}, "vendor", false}, {"basename negation could fire anywhere", []string{"vendor/", "!keep"}, "vendor", true}, {"deep-match negation", []string{"build/", "!**/src"}, "build", true}, + // Globbed first segment — reviewer case. + {"glob star in segment", []string{"vendor*/", "!vendor*/keep"}, "vendor", true}, + {"wildcard segment matches any parent", []string{"*/", "!*/keep"}, "vendor", true}, + {"glob prefix that doesn't match dir", []string{"vendor*/", "!foo*/keep"}, "vendor", false}, + {"nested dir with literal pattern", []string{"pkg/vendor/", "!pkg/vendor/keep"}, "pkg/vendor", true}, + {"nested dir with glob in first segment", []string{"*/vendor/", "!*/vendor/keep"}, "pkg/vendor", true}, + {"pattern with same segment count as dir can't match descendant", []string{"!vendor"}, "vendor", true}, // basename-anywhere } for _, c := range cases { t.Run(c.name, func(t *testing.T) { @@ -134,6 +141,37 @@ func TestMatcher_CouldReinclude(t *testing.T) { } } +// Regression: glob-prefixed negations like `!vendor*/keep` or `!*/keep` +// used to slip past CouldReinclude because it only matched literal +// `dir + "/"` prefixes. Discovery then pruned vendor/ and the +// re-included vendor/keep repo disappeared from the scan. 
+func TestDiscover_HonorsGlobbedNegation(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "vendor", "keep")) + mustMkRepo(t, filepath.Join(root, "vendor", "garbage")) + mustMkRepo(t, filepath.Join(root, "vendor-old", "keep")) + mustMkRepo(t, filepath.Join(root, "unrelated")) + + matcher := NewMatcher([]string{"vendor*/", "!vendor*/keep"}) + repos, err := Discover([]string{root}, matcher, 0) + if err != nil { + t.Fatal(err) + } + got := map[string]bool{} + for _, r := range repos { + got[r.RelPath] = true + } + want := []string{"vendor/keep", "vendor-old/keep", "unrelated"} + for _, w := range want { + if !got[w] { + t.Errorf("missing %q in discovered repos: %+v", w, repos) + } + } + if got["vendor/garbage"] { + t.Error("vendor/garbage should remain ignored") + } +} + func TestDiscover_DoesNotDescendIntoRepo(t *testing.T) { root := t.TempDir() parent := filepath.Join(root, "parent") diff --git a/internal/scan/ignore.go b/internal/scan/ignore.go index 30ab306..3101124 100644 --- a/internal/scan/ignore.go +++ b/internal/scan/ignore.go @@ -116,26 +116,59 @@ func (m *Matcher) Match(relPath string, isDir bool) bool { // the vendor subtree entirely before vendor/keep could be examined, // silently dropping the re-included path. // -// The check is conservative — it returns true when: -// - a negation's pattern begins with `dir/` (explicit descendant -// reference, the documented common case), OR +// Returns true when: // - a negation's pattern has no path separator (basename rules like -// `!keep` that could fire anywhere, including inside dir). +// `!keep` or `!*.go` that could fire at any depth), OR +// - a negation begins with `**/` (deep-match; applies anywhere in the +// tree), OR +// - the pattern's leading path segments are each compatible with the +// corresponding segment of dir via path.Match, so its trailing +// segment(s) could name a descendant. 
This covers literal prefixes +// like `!vendor/keep` AND globbed prefixes like `!vendor*/keep` or +// `!*/keep` — both can match children of an ignored `vendor`. // -// Negations that target a sibling or ancestor path correctly don't -// trigger descent, so pruning remains effective for unrelated ignored -// trees like `node_modules/`. +// Negations that target a sibling or ancestor path (e.g. `!src/keep` +// when walking vendor) correctly don't trigger descent, so pruning +// remains effective for unrelated ignored trees like `node_modules/`. func (m *Matcher) CouldReinclude(dir string) bool { dir = filepath.ToSlash(dir) + dirSegs := strings.Split(dir, "/") for _, r := range m.rules { if !r.Negate { continue } - pat := strings.TrimPrefix(r.Pattern, "**/") + pat := r.Pattern + // `**/foo` and other deep-match patterns can fire at any + // depth — walking into any ignored subtree could reach one. + if strings.HasPrefix(pat, "**/") { + return true + } + // Basename-only rules (no `/`) apply to any segment. `!keep` + // could re-include vendor/keep, a/b/keep, anywhere. if !strings.Contains(pat, "/") { return true } - if strings.HasPrefix(pat, dir+"/") { + patSegs := strings.Split(pat, "/") + // The pattern must have strictly more segments than dir; + // otherwise its deepest named entity is dir itself or an + // ancestor, never a descendant worth descending for. + if len(patSegs) <= len(dirSegs) { + continue + } + // Each of the pattern's leading segments must be compatible + // with the corresponding dir segment. path.Match handles both + // literal names (`vendor`) and globs (`vendor*`, `*`, `?`) + // uniformly; a failing match on any segment rules this + // negation out. 
+ ok := true + for i := 0; i < len(dirSegs); i++ { + matched, err := path.Match(patSegs[i], dirSegs[i]) + if err != nil || !matched { + ok = false + break + } + } + if ok { return true } } From e85768ed22671fc982f2ba6d4cc1f9ea62a0e015 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 13:35:24 -0300 Subject: [PATCH 10/37] Guarantee unique slugs after hashing duplicate basenames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 6-hex (24-bit) hash suffix used for colliding basenames admits a tiny but real birthday-collision probability: two absolute paths whose SHA-1 digests share the first six hex chars would land on the same slug, and scan would silently write both repos' data to the same .jsonl and .state. Corruption of the exact kind the hash was added to prevent. Retry with a longer hash (up to the full 40 hex) whenever the proposed slug set still has duplicates. Happy path remains readable six-char suffixes; the rare collision grows until unique or panics past SHA-1 — the latter would be a cryptographic event, not a scan-time bug. Tests: 10k-duplicate property test (24-bit birthday expects ~3 collisions, forces retry path to fire naturally), determinism across re-runs of the same input, and a targeted test that builds two paths colliding at 6 hex and asserts the retry separates them. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 47 +++++++++++++--- internal/scan/discovery_test.go | 97 +++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 7 deletions(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 4211674..cecbae2 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -141,6 +141,12 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { return repos, nil } +// initialSlugHashLen controls the default hash-suffix length in hex +// chars when two repos share a basename. 
Six chars (24 bits) gives +// readable slugs in the common case; the retry loop in assignSlugs +// grows the length if a truncation collision occurs. +const initialSlugHashLen = 6 + // assignSlugs derives a unique slug per repo from its basename, falling // back to `-` when two repos share a name. // The slug is also the JSONL filename stem and the persistence key @@ -153,6 +159,15 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { // base appears more than once anywhere in the result. This makes the // slug a pure function of (absPath, set of basenames seen), independent // of WalkDir traversal order. +// +// Short-hash truncation (6 hex = 24 bits) admits a small but nonzero +// collision probability — two absolute paths whose SHA-1 digests share +// the same 6-hex prefix would land on the same slug, and scan would +// silently overwrite one repo's JSONL + state file with the other's. +// The retry loop walks the proposed slug set; on any duplicate we +// redo the pass with a longer hash. Grows up to the full 40 hex chars +// before panicking — needing that much is a cryptographic event, not +// a scan-time bug. 
func assignSlugs(repos []Repo) { counts := make(map[string]int) bases := make([]string, len(repos)) @@ -164,15 +179,33 @@ func assignSlugs(repos []Repo) { bases[i] = base counts[base]++ } - for i := range repos { - base := bases[i] - slug := base - if counts[base] > 1 { - h := sha1.Sum([]byte(repos[i].AbsPath)) - slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:6]) + + for hashLen := initialSlugHashLen; hashLen <= 40; hashLen += 2 { + proposed := make([]string, len(repos)) + seen := make(map[string]int, len(repos)) + collided := false + for i := range repos { + base := bases[i] + slug := base + if counts[base] > 1 { + h := sha1.Sum([]byte(repos[i].AbsPath)) + slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:hashLen]) + } + if prev, ok := seen[slug]; ok && prev != i { + collided = true + break + } + seen[slug] = i + proposed[i] = slug + } + if !collided { + for i := range repos { + repos[i].Slug = proposed[i] + } + return } - repos[i].Slug = slug } + panic("scan: slug assignment failed to find unique hash within SHA-1 range") } // isBareRepo returns true when path is a bare git repository — i.e. the diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 2a95aa2..9ec3aa7 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -1,6 +1,9 @@ package scan import ( + "crypto/sha1" + "encoding/hex" + "fmt" "os" "path/filepath" "testing" @@ -86,6 +89,100 @@ func TestDiscover_HonorsNegatedDescendant(t *testing.T) { // negation's target can be visited. Previously the .git detection // ran unconditionally and both incorrectly recorded the ignored // parent AND SkipDir'd the child out of the walk. +// With a 6-hex (24-bit) suffix, the birthday paradox predicts a +// truncation collision around ~2^12 duplicates. Generating 10k paths +// that share a basename makes collisions statistically near-certain +// (expected count ≈ 3). 
The invariant "all resulting slugs are +// distinct" forces assignSlugs's retry loop to grow the hash and +// still produce a unique namespace — the exact corruption class the +// extra iterations were added to prevent. +func TestAssignSlugs_UniqueEvenUnderTruncationCollisions(t *testing.T) { + const n = 10000 + repos := make([]Repo, n) + for i := 0; i < n; i++ { + repos[i] = Repo{AbsPath: fmt.Sprintf("/path/to/dir-%06d/myrepo", i)} + } + assignSlugs(repos) + + slugs := make(map[string]int, n) + for i, r := range repos { + if prev, ok := slugs[r.Slug]; ok { + t.Fatalf("duplicate slug %q between repos[%d]=%s and repos[%d]=%s — JSONL + state files would collide", + r.Slug, prev, repos[prev].AbsPath, i, r.AbsPath) + } + slugs[r.Slug] = i + } +} + +// Directly exercise the retry path with two paths constructed to +// collide at initialSlugHashLen. Without the retry, both would get +// slug "myrepo-" and the scan would silently overwrite one +// repo's files with the other's. With the retry, the loop grows +// the hash until the pair separates. +func TestAssignSlugs_ResolvesFirstPrefixCollision(t *testing.T) { + a, b, found := findColliding6HexPaths(50000) + if !found { + t.Skip("no colliding pair found within search budget — astronomically unlikely, skip rather than flake") + } + repos := []Repo{{AbsPath: a}, {AbsPath: b}} + assignSlugs(repos) + + if repos[0].Slug == repos[1].Slug { + t.Fatalf("retry failed: both repos got slug %q for colliding paths %s and %s", repos[0].Slug, a, b) + } + // Sanity: the slug suffix must be longer than the initial 6 hex, + // proving the retry branch actually fired. 
+ const minLenAfterRetry = len("myrepo-") + initialSlugHashLen + 1 + if len(repos[0].Slug) < minLenAfterRetry && len(repos[1].Slug) < minLenAfterRetry { + t.Errorf("expected at least one slug to have a longer hash after retry; got %q and %q", repos[0].Slug, repos[1].Slug) + } +} + +// findColliding6HexPaths searches a deterministic sequence of +// `myrepo` paths for any two whose SHA-1 digests share the first +// initialSlugHashLen hex chars. At 24 bits of resolution the +// birthday bound hits 50% around N≈4900, so maxAttempts=50000 is +// overwhelmingly likely to yield a pair. Returns false only if no +// collision was found — the test treats that as a skip, not a +// failure, since the worst-case outcome is a missed opportunity to +// exercise the retry, not a bug. +func findColliding6HexPaths(maxAttempts int) (string, string, bool) { + seen := make(map[string]string, maxAttempts) + for i := 0; i < maxAttempts; i++ { + p := fmt.Sprintf("/search/path-%07d/myrepo", i) + h := sha1.Sum([]byte(p)) + prefix := hex.EncodeToString(h[:])[:initialSlugHashLen] + if prev, ok := seen[prefix]; ok { + return prev, p, true + } + seen[prefix] = p + } + return "", "", false +} + +// Same paths ingested twice must produce the same slugs — this is +// what makes resume work across runs. Even with the retry loop +// adjusting hash length under collisions, a deterministic input set +// must yield a deterministic output. 
+func TestAssignSlugs_DeterministicUnderRetry(t *testing.T) { + build := func() []Repo { + rs := make([]Repo, 5000) + for i := 0; i < 5000; i++ { + rs[i] = Repo{AbsPath: fmt.Sprintf("/work/p-%05d/myrepo", i)} + } + return rs + } + a := build() + b := build() + assignSlugs(a) + assignSlugs(b) + for i := range a { + if a[i].Slug != b[i].Slug { + t.Errorf("slug for %s differs across runs: %q vs %q", a[i].AbsPath, a[i].Slug, b[i].Slug) + } + } +} + func TestDiscover_IgnoredRepoNotRecorded_DescendantStillFound(t *testing.T) { root := t.TempDir() // `vendor` is itself a repo AND is ignored by `vendor/`. From c4958bb56d7622b10bb6381d0044ea715a78aeb1 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 13:39:05 -0300 Subject: [PATCH 11/37] Propagate context cancellation into Discover walk Discover took no context, so a ctx-cancelled scan still walked the entire tree before returning. On home-directory scans where the walk phase can be the longest leg of the whole command, Ctrl+C felt unresponsive and anyone running interactive scans saw seconds of post-cancel filesystem churn. Thread ctx into Discover; each filepath.WalkDir callback checks ctx.Err() before doing filesystem work and returns it to short-circuit the walk. A pre-cancelled context test asserts Discover returns context.Canceled with no repos instead of draining the tree. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 23 +++++++++++++- internal/scan/discovery_test.go | 55 ++++++++++++++++++++++++++------- internal/scan/scan.go | 2 +- 3 files changed, 66 insertions(+), 14 deletions(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index cecbae2..2f2716a 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -1,6 +1,7 @@ package scan import ( + "context" "crypto/sha1" "encoding/hex" "fmt" @@ -29,14 +30,27 @@ type Repo struct { // unique across the full result so downstream JSONL files don't collide, // and the prefix LoadMultiJSONL derives (basename-minus-ext) still groups // correctly. -func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { +// +// A cancelled ctx aborts the walk promptly — important because on large +// roots (home directory scans, monorepos of repos) discovery is often +// the longest phase of scan and Ctrl+C needs to land in real time, not +// "after the walk naturally finishes". Each filepath.WalkDir callback +// checks ctx.Done() before doing any filesystem work, so the abort +// window is bounded by the cost of one stat call per dir entry. +func Discover(ctx context.Context, roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { if matcher == nil { matcher = NewMatcher(nil) } + if ctx == nil { + ctx = context.Background() + } var repos []Repo seen := make(map[string]bool) for _, root := range roots { + if err := ctx.Err(); err != nil { + return nil, err + } abs, err := filepath.Abs(root) if err != nil { return nil, fmt.Errorf("resolve %s: %w", root, err) @@ -50,6 +64,13 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { } err = filepath.WalkDir(abs, func(path string, d os.DirEntry, werr error) error { + // Return ctx.Err() (not SkipDir) so WalkDir short-circuits + // the entire walk. 
Without this check, a cancelled scan + // would keep stat'ing every directory in a large tree + // before the caller saw the interrupt. + if err := ctx.Err(); err != nil { + return err + } if werr != nil { // Permission errors on one subtree shouldn't abort the // whole scan. Log and keep walking. diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 9ec3aa7..8639acb 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -1,6 +1,7 @@ package scan import ( + "context" "crypto/sha1" "encoding/hex" "fmt" @@ -20,7 +21,7 @@ func TestDiscover_FindsRepos(t *testing.T) { t.Fatal(err) } - repos, err := Discover([]string{root}, NewMatcher(nil), 0) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatalf("Discover: %v", err) } @@ -44,7 +45,7 @@ func TestDiscover_RespectsIgnore(t *testing.T) { mustMkRepo(t, filepath.Join(root, "node_modules", "garbage")) matcher := NewMatcher([]string{"node_modules"}) - repos, err := Discover([]string{root}, matcher, 0) + repos, err := Discover(context.Background(), []string{root}, matcher, 0) if err != nil { t.Fatal(err) } @@ -64,7 +65,7 @@ func TestDiscover_HonorsNegatedDescendant(t *testing.T) { mustMkRepo(t, filepath.Join(root, "vendor", "keep")) matcher := NewMatcher([]string{"vendor/", "!vendor/keep"}) - repos, err := Discover([]string{root}, matcher, 0) + repos, err := Discover(context.Background(), []string{root}, matcher, 0) if err != nil { t.Fatal(err) } @@ -183,6 +184,36 @@ func TestAssignSlugs_DeterministicUnderRetry(t *testing.T) { } } +// A cancelled context must abort the walk — previously Discover +// ignored ctx entirely, so Ctrl+C during the walk phase only took +// effect after every directory had been stat'd. Test uses a +// pre-cancelled context to trip the early-return on the very first +// callback; no repos should be returned and the error should equal +// ctx.Err(). 
+func TestDiscover_AbortsOnCancelledContext(t *testing.T) { + root := t.TempDir() + // Enough decoy directories that an un-aborted walk would still + // produce observable work — the assertion below fails loudly if + // the check is skipped. + for i := 0; i < 20; i++ { + if err := os.MkdirAll(filepath.Join(root, fmt.Sprintf("dir-%d", i)), 0o755); err != nil { + t.Fatal(err) + } + } + mustMkRepo(t, filepath.Join(root, "would-be-found")) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // pre-cancelled + + repos, err := Discover(ctx, []string{root}, NewMatcher(nil), 0) + if err != context.Canceled { + t.Fatalf("want context.Canceled, got err=%v", err) + } + if len(repos) != 0 { + t.Errorf("want empty repo list after cancel, got %+v", repos) + } +} + func TestDiscover_IgnoredRepoNotRecorded_DescendantStillFound(t *testing.T) { root := t.TempDir() // `vendor` is itself a repo AND is ignored by `vendor/`. @@ -191,7 +222,7 @@ func TestDiscover_IgnoredRepoNotRecorded_DescendantStillFound(t *testing.T) { mustMkRepo(t, filepath.Join(root, "vendor", "keep")) matcher := NewMatcher([]string{"vendor/", "!vendor/keep"}) - repos, err := Discover([]string{root}, matcher, 0) + repos, err := Discover(context.Background(), []string{root}, matcher, 0) if err != nil { t.Fatal(err) } @@ -250,7 +281,7 @@ func TestDiscover_HonorsGlobbedNegation(t *testing.T) { mustMkRepo(t, filepath.Join(root, "unrelated")) matcher := NewMatcher([]string{"vendor*/", "!vendor*/keep"}) - repos, err := Discover([]string{root}, matcher, 0) + repos, err := Discover(context.Background(), []string{root}, matcher, 0) if err != nil { t.Fatal(err) } @@ -277,7 +308,7 @@ func TestDiscover_DoesNotDescendIntoRepo(t *testing.T) { // submodule and skipped — we don't double-count. 
mustMkRepo(t, filepath.Join(parent, "submodule")) - repos, err := Discover([]string{root}, NewMatcher(nil), 0) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatal(err) } @@ -291,7 +322,7 @@ func TestDiscover_SlugCollisionGetsHashSuffix(t *testing.T) { mustMkRepo(t, filepath.Join(root, "a", "myrepo")) mustMkRepo(t, filepath.Join(root, "b", "myrepo")) - repos, err := Discover([]string{root}, NewMatcher(nil), 0) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatal(err) } @@ -320,11 +351,11 @@ func TestDiscover_SlugDeterministicAcrossRuns(t *testing.T) { mustMkRepo(t, filepath.Join(root, "a", "myrepo")) mustMkRepo(t, filepath.Join(root, "b", "myrepo")) - first, err := Discover([]string{root}, NewMatcher(nil), 0) + first, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatal(err) } - second, err := Discover([]string{root}, NewMatcher(nil), 0) + second, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatal(err) } @@ -354,7 +385,7 @@ func TestDiscover_RejectsSymlinkGit(t *testing.T) { } mustMkRepo(t, filepath.Join(root, "real")) - repos, err := Discover([]string{root}, NewMatcher(nil), 0) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatal(err) } @@ -368,7 +399,7 @@ func TestDiscover_MaxDepthHonored(t *testing.T) { mustMkRepo(t, filepath.Join(root, "shallow")) mustMkRepo(t, filepath.Join(root, "a", "b", "c", "deep")) - repos, err := Discover([]string{root}, NewMatcher(nil), 2) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 2) if err != nil { t.Fatal(err) } @@ -411,7 +442,7 @@ func TestDiscover_FindsBareRepo(t *testing.T) { t.Fatal(err) } - repos, err := Discover([]string{root}, NewMatcher(nil), 0) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil 
{ t.Fatal(err) } diff --git a/internal/scan/scan.go b/internal/scan/scan.go index 0d4a41d..4417ad6 100644 --- a/internal/scan/scan.go +++ b/internal/scan/scan.go @@ -80,7 +80,7 @@ func Run(ctx context.Context, cfg Config) (*Result, error) { return nil, err } - repos, err := Discover(cfg.Roots, matcher, cfg.MaxDepth) + repos, err := Discover(ctx, cfg.Roots, matcher, cfg.MaxDepth) if err != nil { return nil, err } From 6153c59f9e651c5b842eb79a9547065c8600f298 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 13:44:30 -0300 Subject: [PATCH 12/37] Harden tests for scan feature gaps Three additions that fill coverage holes called out in the self- assessment of this branch: 1. TestGenerate_PerRepoBreakdownWidthsAreNumeric now asserts at least one bar width renders above zero. The absence-of-error- markers check would previously pass a regression where every width silently collapsed to 0% (bars invisible but template still valid). 2. TestDiscover_AbortsMidWalk measures a baseline walk time, cancels at 25% of that interval, and asserts the cancelled walk returned ctx.Canceled, cut short before baseline, and produced fewer than the full repo count. Complements the existing pre- cancelled test, which only proved the first callback checks ctx. 3. TestGenerateProfile_MultiRepoBreakdownFiltersByEmail covers the scan --email --report flow end-to-end: two repos, one with a colleague-exclusive commit + file, assert the breakdown section renders, commit counts per repo reflect only the filtered dev, and colleague-only files do not bleed into the report. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/report_test.go | 93 +++++++++++++++++++++++++++++++++ internal/scan/discovery_test.go | 69 ++++++++++++++++++++++++ 2 files changed, 162 insertions(+) diff --git a/internal/report/report_test.go b/internal/report/report_test.go index 595b0f6..f27cf33 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "path/filepath" + "regexp" "strings" "testing" @@ -119,6 +120,98 @@ func TestGenerate_PerRepoBreakdownWidthsAreNumeric(t *testing.T) { if strings.Contains(out, "%!f(string=") { t.Error("report contains %!f(string=...) — printf was given a string where a float was expected") } + + // Stronger guard: the absence-of-markers check above would pass + // if a future bug rendered every bar at `width:0%`. The breakdown + // tile has a green fill (background:#216e39) — extract every + // rendered width for those tiles and assert at least one is + // non-zero. With alpha's 10-churn commit and beta's 25-churn + // commit producing distinct totals, the commit-share proportions + // can't all collapse to zero unless the template is broken. + widthRe := regexp.MustCompile(`width:([0-9]+(?:\.[0-9]+)?)%; background:#216e39`) + matches := widthRe.FindAllStringSubmatch(out, -1) + if len(matches) == 0 { + t.Fatal("no repo-breakdown bar widths found; template may have changed selector") + } + sawNonZero := false + for _, m := range matches { + if m[1] != "0" && m[1] != "0.0" { + sawNonZero = true + break + } + } + if !sawNonZero { + t.Errorf("every repo bar rendered with width:0 — proportions would be invisible in the UI; widths: %v", matches) + } +} + +// End-to-end for the `gitcortex scan --email me@x.com --report …` +// flow. Covers three assertions at once: +// 1. GenerateProfile emits the Per-Repository Breakdown section +// when the dataset is multi-repo (gated on len(Repos) > 1). +// 2. 
Counts per repo are filtered to the dev — a commit by +// someone-else@x.com in alpha doesn't bleed into my profile's +// alpha row. +// 3. Files counted per repo are only files THIS dev touched — a +// colleague-exclusive file in alpha must not inflate my scope. +func TestGenerateProfile_MultiRepoBreakdownFiltersByEmail(t *testing.T) { + dir := t.TempDir() + alpha := filepath.Join(dir, "alpha.jsonl") + beta := filepath.Join(dir, "beta.jsonl") + + // alpha: me has 1 commit on a.go; colleague has 1 commit on + // colleague-only.go. beta: me has 1 commit on b.go. + alphaContent := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa01","author_email":"me@x.com","author_name":"Me","author_date":"2024-01-10T00:00:00Z","additions":10,"deletions":0,"files_changed":1} +{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa01","path_current":"a.go","additions":10,"deletions":0} +{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa02","author_email":"colleague@x.com","author_name":"Col","author_date":"2024-01-11T00:00:00Z","additions":5,"deletions":0,"files_changed":1} +{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa02","path_current":"colleague-only.go","additions":5,"deletions":0} +` + betaContent := `{"type":"commit","sha":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb01","author_email":"me@x.com","author_name":"Me","author_date":"2024-02-10T00:00:00Z","additions":20,"deletions":5,"files_changed":1} +{"type":"commit_file","commit":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb01","path_current":"b.go","additions":20,"deletions":5} +` + if err := os.WriteFile(alpha, []byte(alphaContent), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(beta, []byte(betaContent), 0o644); err != nil { + t.Fatal(err) + } + + ds, err := stats.LoadMultiJSONL([]string{alpha, beta}) + if err != nil { + t.Fatalf("LoadMultiJSONL: %v", err) + } + + var buf bytes.Buffer + if err := GenerateProfile(&buf, ds, "scan-profile", "me@x.com"); 
err != nil { + t.Fatalf("GenerateProfile: %v", err) + } + out := buf.String() + + if !strings.Contains(out, "Per-Repository Breakdown") { + t.Fatal("profile report missing the breakdown section — scan --email users won't see their cross-repo split") + } + + // Each row renders as `
alphaN(alpha|beta)(\d+)N
- - - - - - - - - - - -{{range .Repos}} - - - - - - - - - - - -{{end}} -
<table>
  <tr>
    <th>Repository</th><th>Commits</th><th>% Commits</th><th>Churn</th><th>% Churn</th><th>Files</th><th>Active days</th><th>Devs</th><th>First → Last</th>
  </tr>
{{range .Repos}}
  <tr>
    <td>{{.Repo}}</td>
    <td>{{thousands .Commits}}</td>
    <td>
      <div style="background:#eee; border-radius:3px; overflow:hidden">
        <div style="width:{{printf "%.1f" .PctOfTotalCommits}}%; background:#216e39; height:8px"></div>
      </div>
      {{printf "%.1f" .PctOfTotalCommits}}%
    </td>
    <td>{{thousands .Churn}}</td>
    <td>{{printf "%.1f" .PctOfTotalChurn}}%</td>
    <td>{{thousands .Files}}</td>
    <td>{{.ActiveDays}}</td>
    <td>{{.UniqueDevs}}</td>
    <td>{{.FirstCommitDate}} → {{.LastCommitDate}}</td>
  </tr>
{{end}}
</table>
{{if .ActivityYears}}

Activity

Monthly commit heatmap. Darker = more commits. Sudden drop-offs may mark team changes, re-orgs, or freezes; steady cadence signals healthy pace. Hover for details; toggle to table for exact numbers. · {{docRef "activity"}}

From a0a6251af16c5da751344e997fa6f64be363c541 Mon Sep 17 00:00:00 2001
From: lex0c
Date: Mon, 20 Apr 2026 18:07:25 -0300
Subject: [PATCH 21/37] Add --report-dir for per-repo HTML reports plus an
 index landing page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The old `scan --report <path>` rendered one consolidated team report
across every scanned repo, which mixed hotspots, bus factor, and
coupling signal from unrelated codebases. A small `main.go` in a
100-commit repo could never surface beside a 50k-commit `wp-includes/`
— the ranking was dominated by history size, not architectural
significance.

Replace the team-consolidation flow with --report-dir: for each
successful repo in the manifest, load its JSONL as a standalone
Dataset and render <dir>/<slug>.html via the existing report pipeline.
Each HTML is equivalent to running `gitcortex report` against that one
repo; no cross-repo mixing. Afterward emit <dir>/index.html with
per-repo summary cards (commits, churn, devs, first→last date, status
pill, relative-volume bar) linking into every successful report.
Failed and pending entries surface inline so operators can spot them
without opening the manifest.

The only report flag that still takes a single file path is --report
combined with --email, which renders the consolidated profile
(genuinely cross-repo signal: "where did this person spend their
time"). --report alone is now a CLI error pointing at both
alternatives.

Tests cover both the validation branch and the end-to-end --report-dir
flow: two real git repos produce two standalone HTMLs (asserted to
carry no `<slug>:` path prefix, proving LoadJSONL and not
LoadMultiJSONL was used) and an index with hrefs into each.
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 20 ++-- cmd/gitcortex/main.go | 190 ++++++++++++++++++++++++++++------ cmd/gitcortex/main_test.go | 115 ++++++++++++++++++++ internal/report/scan_index.go | 157 ++++++++++++++++++++++++++++ 4 files changed, 445 insertions(+), 37 deletions(-) create mode 100644 internal/report/scan_index.go diff --git a/README.md b/README.md index df25f09..4d1cede 100644 --- a/README.md +++ b/README.md @@ -405,16 +405,22 @@ For workspaces containing many repos (an engineer's `~/work`, a platform team's ### Scan: discover and aggregate every repo under a root -Walk one or more directories, find every git repository (working trees and bare clones both detected), extract them in parallel, and optionally generate a consolidated HTML report. The main use case is "show my manager every project I've touched in the last year" or "give a platform team one dashboard across N services" without scripting the multi-extract + multi-input pipeline manually. +Walk one or more directories, find every git repository (working trees and bare clones both detected), extract them in parallel, and optionally render HTML. Two output modes: + +- `--report-dir ` — one standalone HTML per repo plus an `index.html` landing page linking them. Each per-repo report is equivalent to running `gitcortex report` against that repo alone; no metric mixing across unrelated codebases. +- `--report --email
` — a **single** consolidated profile report for one developer across every scanned repo. The only cross-repo aggregation in the feature, because "where did this person spend their time?" is the only question that genuinely benefits from pooling signal across projects. + +There is no third mode. Cross-repo consolidation at the team/codebase level inflates hotspots, bus factor, and coupling with noise from unrelated codebases; if that's what you want, inspect `manifest.json` or run `gitcortex report` per JSONL. ```bash -# Discover and extract every repo under ~/work, one JSONL per repo +# Discover and extract every repo under ~/work (JSONLs + manifest, no HTML) gitcortex scan --root ~/work --output ./scan-out -# Consolidated report across all discovered repos -gitcortex scan --root ~/work --output ./scan-out --report ./all.html +# Per-repo HTML reports + index landing page +gitcortex scan --root ~/work --output ./scan-out --report-dir ./reports +# opens ./reports/index.html → click through to each repo -# Personal cross-repo profile: only MY commits, with per-repo breakdown +# Personal cross-repo profile: only MY commits, consolidated into one HTML gitcortex scan --root ~/work --output ./scan-out \ --report ./me.html --email me@company.com --since 1y \ --include-commit-messages @@ -422,7 +428,7 @@ gitcortex scan --root ~/work --output ./scan-out \ # Multiple roots, higher parallelism, pre-set ignore patterns gitcortex scan --root ~/work --root ~/personal --root ~/oss \ --parallel 8 --max-depth 4 \ - --output ./scan-out --report ./all.html + --output ./scan-out --report-dir ./reports ``` The scan output directory holds: @@ -450,7 +456,7 @@ vendor/ Directory rules, globs, `**/foo`, and `!path` negations all work. Globbed negations like `!vendor*/keep` are honored — discovery descends into any dir where a negation rule could match a descendant. If `--ignore-file` is not set, scan looks for `.gitcortex-ignore` in the first `--root`. 
-**Consolidated profile report.** When `scan --email me@company.com --report path.html` runs against a multi-repo dataset, the profile report renders a *Per-Repository Breakdown* section: commits, churn, files, active days, and share-of-total — all filtered to that developer's contributions (files count reflects only files the dev touched). The section is profile-only; the team report focuses on the aggregate across repos without a per-repo split (run `stats --input repo.jsonl` per-repo or read the `manifest.json` for that). +**Consolidated profile report.** When `scan --email me@company.com --report path.html` runs against a multi-repo dataset, the profile report renders a *Per-Repository Breakdown* section: commits, churn, files, active days, and share-of-total — all filtered to that developer's contributions (files count reflects only files the dev touched). This is the one report that legitimately aggregates across repos; team-level views live in `--report-dir` (one HTML per repo, never mixed). **Flags worth knowing:** diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 948aac4..90aaaf5 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -872,6 +872,114 @@ func reportCmd() *cobra.Command { return cmd } +// renderScanReportDir is `scan --report-dir` in one place: for each +// successful repo in the manifest, load only that repo's JSONL as a +// standalone dataset, run the normal `report` pipeline into +// /.html, and afterward emit /index.html linking +// all of them. Intentionally uses LoadJSONL (single-input) rather +// than LoadMultiJSONL so each per-repo report's paths carry no +// `:` prefix — each tab of the experience is self-contained. 
+func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOptions, sf stats.StatsFlags, topN int) error { + if err := os.MkdirAll(dir, 0o755); err != nil { + return fmt.Errorf("create report dir: %w", err) + } + + // Build an index-row slice parallel to Manifest.Repos so the + // landing page preserves scan order. We can't skip failed/pending + // entries — operators need to see what didn't produce a report. + entries := make([]reportpkg.ScanIndexEntry, 0, len(result.Manifest.Repos)) + var totalCommits int + allDevs := map[string]struct{}{} + maxCommits := 0 + + for _, m := range result.Manifest.Repos { + entry := reportpkg.ScanIndexEntry{ + Slug: m.Slug, + Path: m.Path, + Status: m.Status, + Error: m.Error, + } + if m.Status != "ok" { + entries = append(entries, entry) + continue + } + + ds, err := stats.LoadJSONL(m.JSONL, loadOpts) + if err != nil { + // Treat load failure as a per-repo failure — index shows it, + // but the scan itself was already successful to that point. 
+ entry.Status = "failed" + entry.Error = fmt.Sprintf("load %s: %v", m.JSONL, err) + entries = append(entries, entry) + continue + } + + outFile := filepath.Join(dir, m.Slug+".html") + f, err := os.Create(outFile) + if err != nil { + return fmt.Errorf("create %s: %w", outFile, err) + } + if err := reportpkg.Generate(f, ds, m.Slug, topN, sf); err != nil { + f.Close() + return fmt.Errorf("generate report for %s: %w", m.Slug, err) + } + f.Close() + + summary := stats.ComputeSummary(ds) + entry.Commits = summary.TotalCommits + entry.Devs = summary.TotalDevs + entry.Files = summary.TotalFiles + entry.Churn = summary.TotalAdditions + summary.TotalDeletions + entry.FirstCommitDate = summary.FirstCommitDate + entry.LastCommitDate = summary.LastCommitDate + entry.ReportHref = m.Slug + ".html" + + totalCommits += entry.Commits + if entry.Commits > maxCommits { + maxCommits = entry.Commits + } + for _, c := range stats.TopContributors(ds, 0) { + allDevs[c.Email] = struct{}{} + } + + entries = append(entries, entry) + } + + indexPath := filepath.Join(dir, "index.html") + f, err := os.Create(indexPath) + if err != nil { + return fmt.Errorf("create index: %w", err) + } + defer f.Close() + + var okCount, failedCount int + for _, e := range entries { + switch e.Status { + case "ok": + okCount++ + case "failed", "pending": + failedCount++ + } + } + + data := reportpkg.ScanIndexData{ + Roots: result.Manifest.Roots, + Repos: entries, + TotalRepos: len(entries), + OKRepos: okCount, + FailedRepos: failedCount, + TotalCommits: totalCommits, + TotalDevs: len(allDevs), + MaxCommits: maxCommits, + } + if err := reportpkg.GenerateScanIndex(f, data); err != nil { + return fmt.Errorf("generate index: %w", err) + } + fmt.Fprintf(os.Stderr, "Per-repo reports written to %s (%d ok, %d failed); open %s\n", + dir, okCount, failedCount, fileURL(indexPath)) + return nil +} + // --- Scan --- func scanCmd() *cobra.Command { @@ -886,6 +994,7 @@ func scanCmd() *cobra.Command { to string since string 
reportPath string + reportDir string topN int extractIgnore []string batchSize int @@ -921,6 +1030,21 @@ breakdown — handy for showing aggregated work across many repos.`, if from != "" && to != "" && from > to { return fmt.Errorf("--from (%s) must be on or before --to (%s)", from, to) } + // Report-flag combinations. --report is reserved for the + // only case that genuinely consolidates signal across repos + // — the per-developer profile; cross-repo team aggregation + // inflates hotspots/bus-factor/coupling with mixed-codebase + // noise and has been replaced by --report-dir which emits + // one standalone HTML per repo plus an index landing page. + if reportPath != "" && email == "" { + return fmt.Errorf("--report requires --email (profile consolidation); for per-repo HTML reports use --report-dir ") + } + if reportPath != "" && reportDir != "" { + return fmt.Errorf("--report and --report-dir are mutually exclusive; pick profile (--report + --email) or per-repo (--report-dir)") + } + if email != "" && reportDir != "" { + return fmt.Errorf("--email filters a single consolidated profile report; combine it with --report , not --report-dir") + } // Resolve --since up-front. 
If this fails we'd otherwise // discover the typo only after a full multi-repo scan — // minutes to hours on a large workspace, all thrown away @@ -977,48 +1101,53 @@ breakdown — handy for showing aggregated work across many repos.`, filepath.Join(result.OutputDir, "manifest.json")) } - if reportPath == "" { - fmt.Fprintf(os.Stderr, "Scan complete: %d JSONL file(s) in %s\n", len(result.JSONLs), result.OutputDir) - return nil - } - - ds, err := stats.LoadMultiJSONL(result.JSONLs, stats.LoadOptions{ + sf := stats.StatsFlags{CouplingMinChanges: couplingMinChanges, NetworkMinFiles: networkMinFiles} + loadOpts := stats.LoadOptions{ From: fromDate, To: to, HalfLifeDays: churnHalfLife, CoupMaxFiles: couplingMaxFiles, - }) - if err != nil { - return fmt.Errorf("load consolidated dataset: %w", err) } - fmt.Fprintf(os.Stderr, "Loaded %d commits across %d repo(s)\n", ds.CommitCount, len(result.JSONLs)) - f, err := os.Create(reportPath) - if err != nil { - return fmt.Errorf("create report: %w", err) - } - defer f.Close() - - // Label the report after the basename of the first --root - // (or the output dir as a fallback). "scan-scan-output" was - // the previous default; users find the root name far more - // recognizable as the H1 of the report. - repoLabel := filepath.Base(result.OutputDir) - if len(cfg.Roots) > 0 { - repoLabel = filepath.Base(absPath(cfg.Roots[0])) - } - sf := stats.StatsFlags{CouplingMinChanges: couplingMinChanges, NetworkMinFiles: networkMinFiles} + // Path 1: profile consolidation — the one cross-repo + // aggregation that earns its place (a single dev's work + // across all scanned repos). Build a multi-repo Dataset + // (path-prefixed via LoadMultiJSONL) and filter by email + // at render time. 
if email != "" { + ds, err := stats.LoadMultiJSONL(result.JSONLs, loadOpts) + if err != nil { + return fmt.Errorf("load consolidated dataset: %w", err) + } + fmt.Fprintf(os.Stderr, "Loaded %d commits across %d repo(s)\n", ds.CommitCount, len(result.JSONLs)) + + f, err := os.Create(reportPath) + if err != nil { + return fmt.Errorf("create report: %w", err) + } + defer f.Close() + + repoLabel := filepath.Base(absPath(cfg.Roots[0])) if err := reportpkg.GenerateProfile(f, ds, repoLabel, email); err != nil { return fmt.Errorf("generate profile report: %w", err) } fmt.Fprintf(os.Stderr, "Profile report for %s written to %s\n", email, fileURL(reportPath)) return nil } - if err := reportpkg.Generate(f, ds, repoLabel, topN, sf); err != nil { - return fmt.Errorf("generate report: %w", err) + + // Path 2: per-repo reports + index. Each repo is loaded + // standalone so its stats reflect its own codebase — no + // hotspot/bus-factor/coupling mixing across unrelated + // projects. The index.html lists every repo with cards + // linking into each per-repo report; failed extracts are + // surfaced inline so operators can spot them without + // opening the manifest. + if reportDir != "" { + return renderScanReportDir(result, reportDir, loadOpts, sf, topN) } - fmt.Fprintf(os.Stderr, "Consolidated report written to %s\n", fileURL(reportPath)) + + // Path 3: no report flag — just confirm JSONLs landed. + fmt.Fprintf(os.Stderr, "Scan complete: %d JSONL file(s) in %s\n", len(result.JSONLs), result.OutputDir) return nil }, } @@ -1032,8 +1161,9 @@ breakdown — handy for showing aggregated work across many repos.`, cmd.Flags().StringVar(&from, "from", "", "Window start date YYYY-MM-DD (forwarded to the consolidated report)") cmd.Flags().StringVar(&to, "to", "", "Window end date YYYY-MM-DD (forwarded to the consolidated report)") cmd.Flags().StringVar(&since, "since", "", "Filter to recent period (e.g. 
7d, 4w, 3m, 1y); mutually exclusive with --from/--to") - cmd.Flags().StringVar(&reportPath, "report", "", "If set, generate a consolidated HTML report at this path after the scan") - cmd.Flags().IntVar(&topN, "top", 20, "Top-N entries per section in the consolidated report") + cmd.Flags().StringVar(&reportPath, "report", "", "With --email, write a developer profile consolidated across all scanned repos to this HTML file. For per-repo reports use --report-dir instead.") + cmd.Flags().StringVar(&reportDir, "report-dir", "", "Directory to write one standalone HTML report per repo plus an index.html landing page. Each per-repo report is equivalent to running `gitcortex report` on that repo alone — no cross-repo metric mixing.") + cmd.Flags().IntVar(&topN, "top", 20, "Top-N entries per section in each per-repo report") cmd.Flags().StringSliceVar(&extractIgnore, "extract-ignore", nil, "Glob patterns forwarded to per-repo extract --ignore (e.g. package-lock.json)") cmd.Flags().IntVar(&batchSize, "batch-size", 1000, "Per-repo extract checkpoint interval") cmd.Flags().BoolVar(&mailmap, "mailmap", false, "Use .mailmap (per repo) to normalize identities") diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index aca5896..884db8f 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -3,6 +3,7 @@ package main import ( "bytes" "os" + "os/exec" "path/filepath" "strings" "testing" @@ -116,6 +117,120 @@ func TestScanCmd_RejectsMissingIgnoreFile(t *testing.T) { } } +// --report without --email is ambiguous under the "no team-wide +// consolidation" design: the only meaningful single-HTML output +// across multiple repos is a developer profile. The error must +// point the user at the right flag for each intent. 
+func TestScanCmd_RejectsReportWithoutEmail(t *testing.T) { + cmd := scanCmd() + cmd.SilenceUsage = true + cmd.SilenceErrors = true + cmd.SetOut(&bytes.Buffer{}) + cmd.SetErr(&bytes.Buffer{}) + cmd.SetArgs([]string{ + "--root", t.TempDir(), + "--output", t.TempDir(), + "--report", filepath.Join(t.TempDir(), "out.html"), + }) + err := cmd.Execute() + if err == nil { + t.Fatal("expected error for --report without --email") + } + if !strings.Contains(err.Error(), "--report-dir") || !strings.Contains(err.Error(), "--email") { + t.Errorf("error should point at both --report-dir and --email as remediation; got %q", err) + } +} + +// `scan --report-dir ` writes index.html plus one HTML per +// successful repo, with each per-repo HTML rendered from a +// single-input Dataset (no cross-repo path prefix). End-to-end +// check against two real repos built with git. +func TestScanCmd_ReportDirGeneratesPerRepoHTMLAndIndex(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not installed") + } + root := t.TempDir() + makeRepoWithCommit(t, filepath.Join(root, "alpha"), "a.go", "package a\n") + makeRepoWithCommit(t, filepath.Join(root, "beta"), "b.py", "print('b')\n") + + scanOut := t.TempDir() + reportDir := t.TempDir() + + cmd := scanCmd() + cmd.SilenceUsage = true + cmd.SilenceErrors = true + cmd.SetOut(&bytes.Buffer{}) + cmd.SetErr(&bytes.Buffer{}) + cmd.SetArgs([]string{ + "--root", root, + "--output", scanOut, + "--report-dir", reportDir, + }) + if err := cmd.Execute(); err != nil { + t.Fatalf("scan --report-dir: %v", err) + } + + // Every successful repo gets a standalone HTML named .html. 
+ for _, slug := range []string{"alpha", "beta"} { + path := filepath.Join(reportDir, slug+".html") + b, err := os.ReadFile(path) + if err != nil { + t.Fatalf("per-repo report missing: %v", err) + } + if !strings.Contains(string(b), "") { + t.Errorf("%s should be a full HTML document", path) + } + // Paths inside the per-repo report must NOT carry a + // `:` prefix — each tab is standalone, so LoadJSONL + // (not LoadMultiJSONL) was used and file paths appear raw. + if strings.Contains(string(b), slug+":") { + t.Errorf("per-repo report %s contains a `%s:` path prefix; Dataset was loaded as multi-repo", path, slug) + } + } + + // index.html links to each per-repo report. + indexBytes, err := os.ReadFile(filepath.Join(reportDir, "index.html")) + if err != nil { + t.Fatalf("index.html missing: %v", err) + } + index := string(indexBytes) + if !strings.Contains(index, `href="alpha.html"`) || !strings.Contains(index, `href="beta.html"`) { + t.Errorf("index.html should link to both per-repo reports; got body excerpt: %.300s", index) + } + if !strings.Contains(index, "Scan Index") { + t.Errorf("index.html missing title block") + } +} + +// makeRepoWithCommit initializes a git repo with one file and one +// commit using a deterministic identity. +func makeRepoWithCommit(t *testing.T, dir, file, contents string) { + t.Helper() + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(dir, file), []byte(contents), 0o644); err != nil { + t.Fatal(err) + } + run := func(args ...string) { + cmd := exec.Command("git", args...) 
+ cmd.Dir = dir + cmd.Env = append(os.Environ(), + "GIT_AUTHOR_NAME=Test", + "GIT_AUTHOR_EMAIL=test@example.com", + "GIT_COMMITTER_NAME=Test", + "GIT_COMMITTER_EMAIL=test@example.com", + ) + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %s: %v\n%s", strings.Join(args, " "), err, out) + } + } + run("init", "-q", "-b", "main") + run("add", ".") + run("commit", "-q", "-m", "initial") +} + func TestValidateDate(t *testing.T) { cases := []struct { in string diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go new file mode 100644 index 0000000..82e39b7 --- /dev/null +++ b/internal/report/scan_index.go @@ -0,0 +1,157 @@ +package report + +import ( + "html/template" + "io" + "time" +) + +// ScanIndexEntry is one repo's row on the scan-index landing page. +// Successful repos populate the numeric fields; failed / pending +// repos leave them zero and surface Error instead. ReportHref is the +// relative URL the index uses to link into each per-repo report — +// empty when no report exists for that entry. +type ScanIndexEntry struct { + Slug string + Path string + Status string + Error string + Commits int + Devs int + Files int + Churn int64 + FirstCommitDate string + LastCommitDate string + ReportHref string +} + +// ScanIndexData is the top-level template input for the index page. +type ScanIndexData struct { + GeneratedAt string + Roots []string + Repos []ScanIndexEntry + // TotalRepos / OKRepos / FailedRepos are precomputed so the + // template doesn't need conditional arithmetic; the summary strip + // at the top renders directly from these. + TotalRepos int + OKRepos int + FailedRepos int + TotalCommits int + TotalDevs int + // Largest repo commit count — used to normalize the bar widths so + // the relative-volume bars are visually comparable across repos. 
+ MaxCommits int +} + +// GenerateScanIndex writes the scan landing page: a per-repo card +// list with links to each repo's standalone report and a short +// summary strip. Failures are surfaced inline rather than hidden so +// operators can spot them at a glance and dig into the manifest. +func GenerateScanIndex(w io.Writer, data ScanIndexData) error { + if data.GeneratedAt == "" { + data.GeneratedAt = time.Now().Format("2006-01-02 15:04") + } + return scanIndexTmpl.Execute(w, data) +} + +var scanIndexTmpl = template.Must(template.New("scan-index").Funcs(funcMap).Parse(scanIndexHTML)) + +const scanIndexHTML = ` + + + +gitcortex scan index ({{.TotalRepos}} repositories) + + + + +

Scan Index {{.TotalRepos}} repositor{{if eq .TotalRepos 1}}y{{else}}ies{{end}}

+

Generated {{.GeneratedAt}}{{if .Roots}} · roots: {{range $i, $r := .Roots}}{{if $i}}, {{end}}{{$r}}{{end}}{{end}}

+ +
+
Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}
+
Total commits
{{humanize .TotalCommits}}
+
Unique devs
{{thousands .TotalDevs}}
+
+ +

Each repo below links to its own standalone report. Metrics are per-repository — no cross-repo aggregation that would mix signals from unrelated codebases. For a consolidated developer view, use gitcortex scan --report <file> --email <address>.

+ +{{$max := .MaxCommits}} +{{range .Repos}} +
+
+
+ {{if .ReportHref}}{{.Slug}}{{else}}{{.Slug}}{{end}} + {{.Status}} +
+
{{.Path}}
+ {{if .Error}}
{{.Error}}
{{end}} +
+ {{if eq .Status "ok"}} +
+ {{humanize .Commits}} + commits +
+
+ {{humanize .Churn}} + churn +
+
+ {{.Devs}} + devs +
+
+ {{humanize .Files}} + files +
+
+ {{.FirstCommitDate}}
→ {{.LastCommitDate}} +
+ {{else}} +
No report available.
+ {{end}} +
+{{if eq .Status "ok"}} +
+
+ {{humanize .Commits}} / {{humanize $max}} max +
+{{end}} +{{end}} + + + + + +` From fefced53834a3727f366fcc4f3f2b857a27f0d31 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 18:17:58 -0300 Subject: [PATCH 22/37] Fix report-dir pending accounting, downgrade render errors, reserve index slug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three MAJORs from the review of commit a0a6251: 1. Pending/skipped entries were counted as failed in both the index summary card and the CLI log line. A user-cancelled scan read as "N failed" — misleading because nothing actually broke. Separate pendingCount bucket, new ScanIndexData.PendingRepos field, and the summary card shows "(M failed) (K pending)" as distinct chips. Log line drops zero buckets so a clean run still says "N ok" instead of "N ok, 0 failed, 0 pending". 2. os.Create and reportpkg.Generate errors aborted the entire batch, discarding every HTML already written AND skipping the index. Policy now matches LoadJSONL: the entry is downgraded to Status=failed with an explanatory Error, best-effort os.Remove cleans up any partial file, and the loop keeps going. Operators get an index listing exactly which repos rendered and which didn't. 3. A repo literally named `index` would emit /index.html and then be overwritten by the landing page. Add a reservedSlugs registry to scan.discovery; isReservedSlug triggers the same hash branch as a duplicate-basename collision. Case-insensitive per the existing fold (macOS/Windows can't tell `Index` from `index` on default filesystems). Tests cover all three: mixed ok+failed+pending manifest asserts the correct counts appear in the index, slug collisions force the hash, and all-failed still emits index.html without tripping the MaxCommits==0 guard in the bar-width template expression. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 47 ++++++++++++-- cmd/gitcortex/main_test.go | 111 ++++++++++++++++++++++++++++++++ internal/report/scan_index.go | 22 ++++--- internal/scan/discovery.go | 19 +++++- internal/scan/discovery_test.go | 26 ++++++++ 5 files changed, 209 insertions(+), 16 deletions(-) diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 90aaaf5..897f33c 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -914,14 +914,29 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt continue } + // All render-time errors for a single repo (create the output + // file, template.Execute) get demoted to an inline failure and + // the loop continues to the next repo — matching the policy + // already applied above for LoadJSONL. Aborting the batch on + // one bad render would discard every HTML already written and + // skip the index entirely, leaving operators staring at an + // incomplete directory with no way to discover the state. outFile := filepath.Join(dir, m.Slug+".html") f, err := os.Create(outFile) if err != nil { - return fmt.Errorf("create %s: %w", outFile, err) + entry.Status = "failed" + entry.Error = fmt.Sprintf("create %s: %v", outFile, err) + entries = append(entries, entry) + continue } if err := reportpkg.Generate(f, ds, m.Slug, topN, sf); err != nil { f.Close() - return fmt.Errorf("generate report for %s: %w", m.Slug, err) + // Best-effort cleanup: leave nothing half-written on disk. 
+ _ = os.Remove(outFile) + entry.Status = "failed" + entry.Error = fmt.Sprintf("render %s: %v", m.Slug, err) + entries = append(entries, entry) + continue } f.Close() @@ -952,12 +967,20 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt } defer f.Close() - var okCount, failedCount int + // Pending and failed live in separate buckets: pending means the + // worker never reached the repo (user cancelled mid-scan, or a + // future scheduling change), while failed means the extract was + // attempted and broke. Conflating them in the summary strip would + // mislead operators reading a cancel-shaped failure as a real + // repo-level problem. + var okCount, failedCount, pendingCount int for _, e := range entries { switch e.Status { case "ok": okCount++ - case "failed", "pending": + case "pending", "skipped": + pendingCount++ + default: failedCount++ } } @@ -968,6 +991,7 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt TotalRepos: len(entries), OKRepos: okCount, FailedRepos: failedCount, + PendingRepos: pendingCount, TotalCommits: totalCommits, TotalDevs: len(allDevs), MaxCommits: maxCommits, @@ -975,8 +999,19 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt if err := reportpkg.GenerateScanIndex(f, data); err != nil { return fmt.Errorf("generate index: %w", err) } - fmt.Fprintf(os.Stderr, "Per-repo reports written to %s (%d ok, %d failed); open %s\n", - dir, okCount, failedCount, fileURL(indexPath)) + + // Log line tracks the summary card: only non-zero buckets appear + // so a fully-successful scan doesn't confuse the operator with + // trailing "0 failed, 0 pending". 
+ parts := []string{fmt.Sprintf("%d ok", okCount)} + if failedCount > 0 { + parts = append(parts, fmt.Sprintf("%d failed", failedCount)) + } + if pendingCount > 0 { + parts = append(parts, fmt.Sprintf("%d pending", pendingCount)) + } + fmt.Fprintf(os.Stderr, "Per-repo reports written to %s (%s); open %s\n", + dir, strings.Join(parts, ", "), fileURL(indexPath)) return nil } diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index 884db8f..2b361d1 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -7,6 +7,9 @@ import ( "path/filepath" "strings" "testing" + + "github.com/lex0c/gitcortex/internal/scan" + "github.com/lex0c/gitcortex/internal/stats" ) // scanCmd must validate --since BEFORE running the discovery walk @@ -202,6 +205,114 @@ func TestScanCmd_ReportDirGeneratesPerRepoHTMLAndIndex(t *testing.T) { } } +// renderScanReportDir must render the index for a manifest that +// mixes ok / failed / pending statuses. Failed entries show their +// error message, pending entries render with the pending pill, and +// neither inflates the "failed" count in the summary card. 
+func TestRenderScanReportDir_MixedStatuses(t *testing.T) { + dir := t.TempDir() + okJSONL := filepath.Join(dir, "good.jsonl") + jsonlContent := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","author_email":"me@x.com","author_name":"Me","author_date":"2024-01-01T00:00:00Z","additions":10,"deletions":0,"files_changed":1} +{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"a.go","additions":10,"deletions":0} +` + if err := os.WriteFile(okJSONL, []byte(jsonlContent), 0o644); err != nil { + t.Fatal(err) + } + + reportDir := t.TempDir() + result := &scan.Result{ + OutputDir: dir, + Manifest: scan.Manifest{ + Roots: []string{"/fake/root"}, + Repos: []scan.ManifestRepo{ + {Slug: "good", Path: "/fake/root/good", Status: "ok", JSONL: okJSONL}, + {Slug: "broken", Path: "/fake/root/broken", Status: "failed", Error: "simulated extract crash"}, + {Slug: "skipped", Path: "/fake/root/skipped", Status: "pending"}, + }, + }, + } + + err := renderScanReportDir(result, reportDir, + stats.LoadOptions{HalfLifeDays: 90, CoupMaxFiles: 50}, + stats.StatsFlags{CouplingMinChanges: 1, NetworkMinFiles: 1}, + 10) + if err != nil { + t.Fatalf("renderScanReportDir returned error despite inline-failure policy: %v", err) + } + + // OK entry's report file exists; failed/pending do not. + if _, err := os.Stat(filepath.Join(reportDir, "good.html")); err != nil { + t.Errorf("good.html missing: %v", err) + } + for _, unwanted := range []string{"broken.html", "skipped.html"} { + if _, err := os.Stat(filepath.Join(reportDir, unwanted)); err == nil { + t.Errorf("%s should NOT be written — status was not ok", unwanted) + } + } + + indexB, err := os.ReadFile(filepath.Join(reportDir, "index.html")) + if err != nil { + t.Fatalf("index.html: %v", err) + } + index := string(indexB) + + // Summary card splits failed and pending correctly: "(1 failed)" + // AND "(1 pending)" appear, not "(2 failed)". 
+ if strings.Contains(index, "(2 failed)") { + t.Error("summary miscounted pending as failed — `(2 failed)` appeared instead of separate buckets") + } + if !strings.Contains(index, "1 failed") { + t.Errorf("summary should show `1 failed`; index excerpt: %.600s", index) + } + if !strings.Contains(index, "1 pending") { + t.Errorf("summary should show `1 pending`; index excerpt: %.600s", index) + } + + // Failed entry's error bubbles up to the card. + if !strings.Contains(index, "simulated extract crash") { + t.Error("failed entry's error message missing from index") + } + + // Status pills render for each entry. Grep for the pill classes. + for _, want := range []string{"status-ok", "status-failed", "status-pending"} { + if !strings.Contains(index, want) { + t.Errorf("index missing %q pill — a status branch lost its style", want) + } + } + + // Only OK entries carry an href; failed/pending must not link to + // a file that doesn't exist. + if strings.Contains(index, `href="broken.html"`) || strings.Contains(index, `href="skipped.html"`) { + t.Error("index links to a non-existent per-repo HTML for a non-ok entry") + } +} + +// Render with zero successful repos must still emit an index +// without tripping on the MaxCommits==0 guard in the bar-width +// template expression. 
+func TestRenderScanReportDir_AllFailedStillEmitsIndex(t *testing.T) { + reportDir := t.TempDir() + result := &scan.Result{ + OutputDir: t.TempDir(), + Manifest: scan.Manifest{ + Repos: []scan.ManifestRepo{ + {Slug: "a", Status: "failed", Error: "boom"}, + {Slug: "b", Status: "failed", Error: "also boom"}, + }, + }, + } + err := renderScanReportDir(result, reportDir, + stats.LoadOptions{HalfLifeDays: 90, CoupMaxFiles: 50}, + stats.StatsFlags{CouplingMinChanges: 1, NetworkMinFiles: 1}, + 10) + if err != nil { + t.Fatalf("render should not fail on all-failed manifest: %v", err) + } + if _, err := os.Stat(filepath.Join(reportDir, "index.html")); err != nil { + t.Fatalf("index.html must be emitted even when every repo failed: %v", err) + } +} + // makeRepoWithCommit initializes a git repo with one file and one // commit using a deterministic identity. func makeRepoWithCommit(t *testing.T, dir, file, contents string) { diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 82e39b7..80f410a 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -30,14 +30,18 @@ type ScanIndexData struct { GeneratedAt string Roots []string Repos []ScanIndexEntry - // TotalRepos / OKRepos / FailedRepos are precomputed so the - // template doesn't need conditional arithmetic; the summary strip - // at the top renders directly from these. - TotalRepos int - OKRepos int - FailedRepos int - TotalCommits int - TotalDevs int + // TotalRepos / OKRepos / FailedRepos / PendingRepos are + // precomputed so the template doesn't need conditional arithmetic. + // Pending is distinct from failed: a pending repo is one the + // worker never reached (cancelled mid-scan), a failed repo is one + // whose extract or render broke. Mixing them in the summary would + // read a cancel-shaped partial run as a fleet-of-errors. 
+ TotalRepos int + OKRepos int + FailedRepos int + PendingRepos int + TotalCommits int + TotalDevs int // Largest repo commit count — used to normalize the bar widths so // the relative-volume bars are visually comparable across repos. MaxCommits int @@ -100,7 +104,7 @@ footer { margin-top: 30px; color: #656d76; font-size: 12px; text-align: center;

Generated {{.GeneratedAt}}{{if .Roots}} · roots: {{range $i, $r := .Roots}}{{if $i}}, {{end}}{{$r}}{{end}}{{end}}

-
Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}
+
Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}{{if gt .PendingRepos 0}} ({{.PendingRepos}} pending){{end}}
Total commits
{{humanize .TotalCommits}}
Unique devs
{{thousands .TotalDevs}}
diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 2989893..2d13774 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -187,6 +187,23 @@ func Discover(ctx context.Context, roots []string, matcher *Matcher, maxDepth in // grows the length if a truncation collision occurs. const initialSlugHashLen = 6 +// reservedSlugs are base names that cannot be emitted bare because +// downstream consumers reserve the name for their own output file. +// `scan --report-dir` writes /index.html as the landing page; +// a repo whose basename was literally `index` would collide and +// silently get overwritten by the landing or vice versa. Forcing +// the hash branch for reserved names avoids the collision without +// losing the repo. Compared case-insensitively to align with the +// case-folding done elsewhere in assignSlugs. +var reservedSlugs = map[string]struct{}{ + "index": {}, +} + +func isReservedSlug(base string) bool { + _, ok := reservedSlugs[strings.ToLower(base)] + return ok +} + // assignSlugs derives a unique slug per repo from its basename, falling // back to `-` when two repos share a name. // The slug is also the JSONL filename stem and the persistence key @@ -239,7 +256,7 @@ func assignSlugs(repos []Repo) { for i := range repos { base := bases[i] slug := base - if counts[strings.ToLower(base)] > 1 { + if counts[strings.ToLower(base)] > 1 || isReservedSlug(base) { h := sha1.Sum([]byte(repos[i].AbsPath)) slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:hashLen]) } diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 5c1fa19..f8fd900 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -163,6 +163,32 @@ func findColliding6HexPaths(maxAttempts int) (string, string, bool) { return "", "", false } +// `index` is the landing-page filename used by `scan --report-dir`. 
+// A repo whose basename is literally `index` would emit +// `/index.html`, colliding with (and getting overwritten by) +// the landing page. Force the hash branch for reserved names so +// the per-repo report file never lands on a reserved path. +func TestAssignSlugs_ReservesIndex(t *testing.T) { + repos := []Repo{ + {AbsPath: "/workspace/index"}, + {AbsPath: "/other/unrelated"}, + } + assignSlugs(repos) + + for _, r := range repos { + if filepath.Base(r.AbsPath) == "index" && r.Slug == "index" { + t.Errorf("`index` basename must not produce the bare slug; landing page would be overwritten (path=%s, slug=%s)", r.AbsPath, r.Slug) + } + } + // Case-insensitivity: `Index` would also collide on a case- + // insensitive filesystem serving the HTML. + repos = []Repo{{AbsPath: "/workspace/Index"}} + assignSlugs(repos) + if strings.EqualFold(repos[0].Slug, "index") { + t.Errorf("`Index` basename must also be reserved; got %q", repos[0].Slug) + } +} + // Regression: on macOS (APFS/HFS+ default) and Windows (NTFS), // `Repo.jsonl` and `repo.jsonl` are the same file on disk. A // case-sensitive slug compare would hand both repos the bare From f2810b5cd679afccc6328b91b569d5d29125927f Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 18:24:05 -0300 Subject: [PATCH 23/37] Align scan-index footer with the per-repo report footer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both pages now render the identical 'Generated by gitcortex · ' footer — same link attributes, same color and text-decoration, same border-top spacing. The duplicate date in the subtitle (removed) kept the top strip uncluttered for the only stable identifier that varied between index and repo pages: the scanned roots. Visually the index now reads as the sibling of every per-repo report it links to instead of a one-off landing. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/scan_index.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 80f410a..ec0a063 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -94,14 +94,14 @@ h1 { font-size: 24px; margin-bottom: 4px; } .status-ok { background: #dafbe1; color: #1a7f37; } .status-failed { background: #ffebe9; color: #cf222e; } .status-pending { background: #fff4d4; color: #9a6700; } -footer { margin-top: 30px; color: #656d76; font-size: 12px; text-align: center; border-top: 1px solid #d0d7de; padding-top: 14px; } +footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; color: #656d76; font-size: 12px; } .hint { font-size: 12px; color: #656d76; font-style: italic; margin-bottom: 14px; }

Scan Index {{.TotalRepos}} repositor{{if eq .TotalRepos 1}}y{{else}}ies{{end}}

-

Generated {{.GeneratedAt}}{{if .Roots}} · roots: {{range $i, $r := .Roots}}{{if $i}}, {{end}}{{$r}}{{end}}{{end}}

+

{{if .Roots}}roots: {{range $i, $r := .Roots}}{{if $i}}, {{end}}{{$r}}{{end}}{{end}}

Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}{{if gt .PendingRepos 0}} ({{.PendingRepos}} pending){{end}}
@@ -154,7 +154,7 @@ footer { margin-top: 30px; color: #656d76; font-size: 12px; text-align: center; {{end}} {{end}} - + From c0bfccb4d4d1c8a7c74dd99562c4377ee5ff8527 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:01:13 -0300 Subject: [PATCH 24/37] Simplify scan-index heading and drop roots subtitle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shorten the H1 from "Scan Index" to "Index" and drop the "roots: ..." subtitle line. The roots were borderline-useful — the user picked them, so echoing them back added chrome without adding information. With the data summary cards carrying the interesting counts below, a terser heading reads cleaner. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main_test.go | 2 +- internal/report/scan_index.go | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index 2b361d1..7b5c9c3 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -200,7 +200,7 @@ func TestScanCmd_ReportDirGeneratesPerRepoHTMLAndIndex(t *testing.T) { if !strings.Contains(index, `href="alpha.html"`) || !strings.Contains(index, `href="beta.html"`) { t.Errorf("index.html should link to both per-repo reports; got body excerpt: %.300s", index) } - if !strings.Contains(index, "Scan Index") { + if !strings.Contains(index, "

Index") { t.Errorf("index.html missing title block") } } diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index ec0a063..2340a8c 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -100,8 +100,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col -

Scan Index {{.TotalRepos}} repositor{{if eq .TotalRepos 1}}y{{else}}ies{{end}}

-

{{if .Roots}}roots: {{range $i, $r := .Roots}}{{if $i}}, {{end}}{{$r}}{{end}}{{end}}

+

Index {{.TotalRepos}} repositor{{if eq .TotalRepos 1}}y{{else}}ies{{end}}

Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}{{if gt .PendingRepos 0}} ({{.PendingRepos}} pending){{end}}
From 3494e0e1ecb0cb5318d7cd284f715533af93d7a7 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:03:34 -0300 Subject: [PATCH 25/37] Trim redundant chrome from scan-index cards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two small subtractions driven by the same principle — don't repeat signal the surrounding layout already conveys: - Drop the "ok" status pill. The link itself plus the populated metric cells make status obvious for successful entries; showing a neutral pill on every card added noise. Failed and pending pills stay because those statuses swap the card body for an italic "No report available." and the pill is the only affordance marking which failure mode it was. - Drop the "N / M max" caption next to the relative-volume bar. The bar already visualizes the ratio, the "commits" metric cell already shows N in the same row, and the max is inferred by eyeballing the widest bar. Keeping the bar alone preserves comparison-at-a-glance without a third copy of the same number. Test updated to assert `status-ok` pill is absent while failed / pending pills remain. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main_test.go | 11 ++++++++--- internal/report/scan_index.go | 3 +-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index 7b5c9c3..ba25b4c 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -273,12 +273,17 @@ func TestRenderScanReportDir_MixedStatuses(t *testing.T) { t.Error("failed entry's error message missing from index") } - // Status pills render for each entry. Grep for the pill classes. - for _, want := range []string{"status-ok", "status-failed", "status-pending"} { + // Status pills render only for non-ok entries (ok is implicit + // given the link itself and the presence of metric cells). Assert + // the two non-ok pills exist and the ok one does not. 
+ for _, want := range []string{"status-failed", "status-pending"} { if !strings.Contains(index, want) { - t.Errorf("index missing %q pill — a status branch lost its style", want) + t.Errorf("index missing %q pill — status branch lost its style", want) } } + if strings.Contains(index, `class="status-pill status-ok"`) { + t.Error("ok entries should not render a status pill — redundant with the link and metric cells") + } // Only OK entries carry an href; failed/pending must not link to // a file that doesn't exist. diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 2340a8c..460ec43 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -116,7 +116,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col
{{if .ReportHref}}{{.Slug}}{{else}}{{.Slug}}{{end}} - {{.Status}} + {{if ne .Status "ok"}}{{.Status}}{{end}}
{{.Path}}
{{if .Error}}
{{.Error}}
{{end}} @@ -148,7 +148,6 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{if eq .Status "ok"}}
- {{humanize .Commits}} / {{humanize $max}} max
{{end}} {{end}} From 49cd00daee7a706211a29ef0a53b86324c0b2771 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:05:19 -0300 Subject: [PATCH 26/37] Drop green left-border accent from ok scan-index cards The green .repo.ok border competed with the layout's default border-radius and made a table of healthy repos look noisier than it needed to. Red (failed) and amber (pending) borders stay as attention cues for the genuinely actionable states; the absence of a stripe on ok cards now reads as "nothing to flag" by contrast. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/scan_index.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 460ec43..898b467 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -77,7 +77,6 @@ h1 { font-size: 24px; margin-bottom: 4px; } .repo { background: #fff; border: 1px solid #d0d7de; border-radius: 6px; padding: 16px 20px; margin-bottom: 10px; display: grid; grid-template-columns: 2fr 1fr 1fr 1fr 1fr 1.2fr; gap: 16px; align-items: center; } .repo.failed { border-left: 4px solid #cf222e; } .repo.pending { border-left: 4px solid #bf8700; } -.repo.ok { border-left: 4px solid #2da44e; } .repo .name { font-weight: 600; font-size: 15px; } .repo .name a { color: #0969da; text-decoration: none; } .repo .name a:hover { text-decoration: underline; } From 2ae796156495ac977b41ce525ab2ce65c392353a Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:13:37 -0300 Subject: [PATCH 27/37] Surface last-commit recency on scan index cards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dates cell used to show "2024-01-15 → 2026-04-18" raw, forcing operators scanning a list of 30 repos to do date math to pick out the abandoned ones. 
Add a recency chip above the date range: "today" / "Nd ago" / "Nmo ago" / "Ny ago", colored by freshness bucket — green for fresh (≤30d), gray for stable (≤1y), amber for stale (>1y). The raw dates stay as supporting detail for anyone who wants the exact range. HumanizeAgo is the caller entry point (uses time.Now); humanizeAgoAt takes an explicit reference time for deterministic tests. Table test pins the day/month/year transitions and the 365-day stable/stale boundary; render test asserts the chip class reaches the HTML and is absent from failed entries that have no dates. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 1 + internal/report/scan_index.go | 58 +++++++++++++++++- internal/report/scan_index_test.go | 98 ++++++++++++++++++++++++++++++ 3 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 internal/report/scan_index_test.go diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 897f33c..e1f3132 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -947,6 +947,7 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt entry.Churn = summary.TotalAdditions + summary.TotalDeletions entry.FirstCommitDate = summary.FirstCommitDate entry.LastCommitDate = summary.LastCommitDate + entry.LastCommitAgo, entry.RecencyBucket = reportpkg.HumanizeAgo(summary.LastCommitDate) entry.ReportHref = m.Slug + ".html" totalCommits += entry.Commits diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 898b467..64084f2 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -1,6 +1,7 @@ package report import ( + "fmt" "html/template" "io" "time" @@ -22,7 +23,57 @@ type ScanIndexEntry struct { Churn int64 FirstCommitDate string LastCommitDate string - ReportHref string + // LastCommitAgo is LastCommitDate humanized relative to the + // index generation time: "today" / "Nd ago" / "Nmo ago" / "Ny ago". 
+ // Lets operators spot abandoned repos in a list of 30 without + // doing date math in their head. + LastCommitAgo string + // RecencyBucket classifies LastCommitAgo into "fresh" (≤ 30 days), + // "stable" (≤ 1 year), "stale" (> 1 year). Drives the badge color + // in the template so the cold repos stand out at a glance. + RecencyBucket string + ReportHref string +} + +// HumanizeAgo is the caller entry point for the index: formats +// `lastDate` (YYYY-MM-DD) as a compact "ago" phrase relative to +// wall-clock now, and classifies recency into fresh / stable / +// stale for template coloring. Empty pair on unparsable input so +// failed/pending rows with no dates render nothing. +func HumanizeAgo(lastDate string) (label, bucket string) { + return humanizeAgoAt(lastDate, time.Now()) +} + +// humanizeAgoAt is the testable core — same logic as HumanizeAgo +// but takes an explicit "now" so tests can pin the reference time +// and avoid drift. +func humanizeAgoAt(lastDate string, now time.Time) (label, bucket string) { + t, err := time.Parse("2006-01-02", lastDate) + if err != nil { + return "", "" + } + days := int(now.Sub(t).Hours() / 24) + switch { + case days < 0: + return "", "" + case days == 0: + label = "today" + case days < 30: + label = fmt.Sprintf("%dd ago", days) + case days < 365: + label = fmt.Sprintf("%dmo ago", days/30) + default: + label = fmt.Sprintf("%dy ago", days/365) + } + switch { + case days <= 30: + bucket = "fresh" + case days <= 365: + bucket = "stable" + default: + bucket = "stale" + } + return label, bucket } // ScanIndexData is the top-level template input for the index page. 
@@ -89,6 +140,10 @@ h1 { font-size: 24px; margin-bottom: 4px; } .repo .bar-outer { flex: 1; height: 8px; background: #eaeef2; border-radius: 3px; overflow: hidden; } .repo .bar-inner { height: 100%; background: #2da44e; } .repo .dates { font-size: 11px; color: #656d76; font-family: "SF Mono", Consolas, monospace; } +.recency { display: inline-block; padding: 2px 8px; border-radius: 10px; font-size: 11px; font-weight: 600; font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; margin-bottom: 4px; } +.recency.fresh { background: #dafbe1; color: #1a7f37; } +.recency.stable { background: #eaeef2; color: #656d76; } +.recency.stale { background: #fff4d4; color: #9a6700; } .status-pill { display: inline-block; padding: 2px 8px; border-radius: 10px; font-size: 11px; font-weight: 600; } .status-ok { background: #dafbe1; color: #1a7f37; } .status-failed { background: #ffebe9; color: #cf222e; } @@ -138,6 +193,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col files
+ {{if .LastCommitAgo}}{{.LastCommitAgo}}
{{end}} {{.FirstCommitDate}}
→ {{.LastCommitDate}}
{{else}} diff --git a/internal/report/scan_index_test.go b/internal/report/scan_index_test.go new file mode 100644 index 0000000..6164640 --- /dev/null +++ b/internal/report/scan_index_test.go @@ -0,0 +1,98 @@ +package report + +import ( + "bytes" + "strings" + "testing" + "time" +) + +func TestHumanizeAgoAt(t *testing.T) { + now := time.Date(2026, 4, 20, 12, 0, 0, 0, time.UTC) + + cases := []struct { + name string + lastDate string + wantLabel string + wantBucket string + }{ + // Boundary on day 0 — "today" reads cleaner than "0d ago". + {"same day", "2026-04-20", "today", "fresh"}, + {"one day", "2026-04-19", "1d ago", "fresh"}, + {"29 days", "2026-03-22", "29d ago", "fresh"}, + // Transition from days to months at 30. + {"30 days exact", "2026-03-21", "1mo ago", "fresh"}, + {"two months", "2026-02-15", "2mo ago", "stable"}, + {"eleven months", "2025-05-25", "11mo ago", "stable"}, + // 365-day boundary: still "stable" at exactly one year, + // "stale" the day after. 2025-04-20 is 365 days before now. + {"one year exact (stable boundary)", "2025-04-20", "1y ago", "stable"}, + {"one year + one day (stale)", "2025-04-19", "1y ago", "stale"}, + {"two years stale", "2024-04-10", "2y ago", "stale"}, + // Parse failure yields empty. + {"bad input", "not-a-date", "", ""}, + // Future dates (clock skew) yield empty — we don't label "in 3d" + // on an index that exists to surface recency of PAST commits. + {"future date", "2026-05-01", "", ""}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + label, bucket := humanizeAgoAt(c.lastDate, now) + if label != c.wantLabel { + t.Errorf("label: got %q, want %q", label, c.wantLabel) + } + if bucket != c.wantBucket { + t.Errorf("bucket: got %q, want %q", bucket, c.wantBucket) + } + }) + } +} + +// End-to-end check that the recency chip renders for an ok entry +// and is absent for failed entries (which have no dates). Confirms +// the CSS bucket class is reachable by the template. 
+func TestGenerateScanIndex_RecencyChipRenders(t *testing.T) { + data := ScanIndexData{ + GeneratedAt: "2026-04-20 12:00", + TotalRepos: 2, + OKRepos: 1, + FailedRepos: 1, + MaxCommits: 10, + Repos: []ScanIndexEntry{ + { + Slug: "alive", + Path: "/work/alive", + Status: "ok", + Commits: 10, + FirstCommitDate: "2024-01-01", + LastCommitDate: "2026-04-18", + LastCommitAgo: "2d ago", + RecencyBucket: "fresh", + ReportHref: "alive.html", + }, + { + Slug: "broken", + Path: "/work/broken", + Status: "failed", + Error: "boom", + }, + }, + } + var buf bytes.Buffer + if err := GenerateScanIndex(&buf, data); err != nil { + t.Fatal(err) + } + out := buf.String() + if !strings.Contains(out, `class="recency fresh"`) { + t.Error("fresh recency chip missing from ok entry") + } + if !strings.Contains(out, `>2d ago<`) { + t.Error("recency label text missing") + } + // Failed entry has no dates block, so no recency chip should + // render for it — a weak but useful guard against a template + // restructure leaking the chip into the failure render. + if strings.Count(out, `class="recency`) != 1 { + t.Errorf("expected exactly one recency chip (ok entry only); got %d", strings.Count(out, `class="recency`)) + } +} From d59338ef6817927803efdde6bc36b0752cb6d4d5 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:37:32 -0300 Subject: [PATCH 28/37] Address 7 minors from the recency-batch review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seven small polish items surfaced by the post-recency review. All live in or around the new scan-index landing page: - Recency label at 360-364 days read as "12mo ago" under the stable color. Clamp months at 11 so the label progression never crosses into two-digit territory inside the still-stable band. - Ok entry whose dataset had no dates (empty JSONL, rename-only history) rendered an orphan arrow `
→ ` in the dates cell. Guard the date range behind
  `{{if .LastCommitDate}}` so the cell emits nothing instead of a
  lone glyph.

- Log line started every run with `0 ok` even when the count was
  zero, drifting from the summary card's zero-suppression
  convention. Build the log bucket list the same way — only
  non-zero counts, and open with the first non-zero bucket.

- `Data.Roots` field and `.subtitle` CSS rule were dead after the
  roots-subtitle removal commit. Drop both plus the now-unused
  Roots assignment at the call site.

- Convert `reservedSlugs` from a package-level mutable map to an
  `isReservedSlug` switch. Keeps the reserved set immutable — no
  risk of a future test silently leaking an entry across the
  process.

- Strengthen the recency render test: anchor the chip inside the
  `.dates` wrapper so a refactor that moves the chip outside its
  structural context fails the test instead of passing on the CSS
  class alone.

- Extend the humanizeAgoAt table with 360/364-day "stable" cases
  (pinning the new month clamp) and a sub-day-future case that
  documents the "today" fallback for intra-day committer-date
  skew.
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 16 +++++++++++----- internal/report/scan_index.go | 25 +++++++++++++++++++++---- internal/report/scan_index_test.go | 24 +++++++++++++++++++++++- internal/scan/discovery.go | 28 +++++++++++++++------------- 4 files changed, 70 insertions(+), 23 deletions(-) diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index e1f3132..3937fcb 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -987,7 +987,6 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt } data := reportpkg.ScanIndexData{ - Roots: result.Manifest.Roots, Repos: entries, TotalRepos: len(entries), OKRepos: okCount, @@ -1001,16 +1000,23 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt return fmt.Errorf("generate index: %w", err) } - // Log line tracks the summary card: only non-zero buckets appear - // so a fully-successful scan doesn't confuse the operator with - // trailing "0 failed, 0 pending". - parts := []string{fmt.Sprintf("%d ok", okCount)} + // Log line tracks the summary card's zero-suppression: only + // non-zero buckets appear, and the opener is whichever bucket + // has a count so an all-failed scan reads as "2 failed" instead + // of the misleading "0 ok, 2 failed". 
+ var parts []string + if okCount > 0 { + parts = append(parts, fmt.Sprintf("%d ok", okCount)) + } if failedCount > 0 { parts = append(parts, fmt.Sprintf("%d failed", failedCount)) } if pendingCount > 0 { parts = append(parts, fmt.Sprintf("%d pending", pendingCount)) } + if len(parts) == 0 { + parts = []string{"0 repos"} // defensive; renderScanReportDir requires > 0 manifest entries to be reached at all + } fmt.Fprintf(os.Stderr, "Per-repo reports written to %s (%s); open %s\n", dir, strings.Join(parts, ", "), fileURL(indexPath)) return nil diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 64084f2..4887641 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -47,6 +47,16 @@ func HumanizeAgo(lastDate string) (label, bucket string) { // humanizeAgoAt is the testable core — same logic as HumanizeAgo // but takes an explicit "now" so tests can pin the reference time // and avoid drift. +// +// Future-commit policy: a date strictly in the past ("days >= 0 +// when measured against midnight-to-midnight") is labeled. +// Anything earlier than today's date (even by one hour, via +// committer-date clock skew or CI rewrite) yields empty — the +// index exists to surface how stale each repo is, and "in 3d" on +// a repo whose scanner clock drifted would be actively misleading. +// Sub-day skew within TODAY lands in `days == 0 → "today"`, which +// is the most reasonable fallback given the template's day +// granularity. 
func humanizeAgoAt(lastDate string, now time.Time) (label, bucket string) { t, err := time.Parse("2006-01-02", lastDate) if err != nil { @@ -61,7 +71,16 @@ func humanizeAgoAt(lastDate string, now time.Time) (label, bucket string) { case days < 30: label = fmt.Sprintf("%dd ago", days) case days < 365: - label = fmt.Sprintf("%dmo ago", days/30) + // Cap the month reading at 11 — otherwise `days/30` emits + // "12mo ago" at days 360-364, which reads as older than + // "1y ago" even though it's actually in the still-stable + // (≤365d) band. Clamping keeps the label progression + // monotonic with the bucket color. + months := days / 30 + if months > 11 { + months = 11 + } + label = fmt.Sprintf("%dmo ago", months) default: label = fmt.Sprintf("%dy ago", days/365) } @@ -79,7 +98,6 @@ func humanizeAgoAt(lastDate string, now time.Time) (label, bucket string) { // ScanIndexData is the top-level template input for the index page. type ScanIndexData struct { GeneratedAt string - Roots []string Repos []ScanIndexEntry // TotalRepos / OKRepos / FailedRepos / PendingRepos are // precomputed so the template doesn't need conditional arithmetic. @@ -120,7 +138,6 @@ const scanIndexHTML = ` * { margin: 0; padding: 0; box-sizing: border-box; } body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; color: #24292f; background: #f6f8fa; padding: 20px; max-width: 1200px; margin: 0 auto; } h1 { font-size: 24px; margin-bottom: 4px; } -.subtitle { color: #656d76; font-size: 13px; margin-bottom: 24px; } .summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); gap: 12px; margin-bottom: 24px; } .summary-card { background: #fff; border: 1px solid #d0d7de; border-radius: 6px; padding: 16px; } .summary-card .label { font-size: 12px; color: #656d76; text-transform: uppercase; } @@ -194,7 +211,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col
{{if .LastCommitAgo}}{{.LastCommitAgo}}
{{end}} - {{.FirstCommitDate}}
→ {{.LastCommitDate}} + {{if .LastCommitDate}}{{.FirstCommitDate}}
→ {{.LastCommitDate}}{{end}}
{{else}}
No report available.
diff --git a/internal/report/scan_index_test.go b/internal/report/scan_index_test.go index 6164640..aca7ff1 100644 --- a/internal/report/scan_index_test.go +++ b/internal/report/scan_index_test.go @@ -24,6 +24,11 @@ func TestHumanizeAgoAt(t *testing.T) { {"30 days exact", "2026-03-21", "1mo ago", "fresh"}, {"two months", "2026-02-15", "2mo ago", "stable"}, {"eleven months", "2025-05-25", "11mo ago", "stable"}, + // Month label must not exceed "11mo" — days/30 = 12 on the + // 360-day boundary, but the label is clamped so the stable + // band always reads as sub-year. + {"month clamp at 360 days", "2025-04-25", "11mo ago", "stable"}, + {"month clamp at 364 days", "2025-04-21", "11mo ago", "stable"}, // 365-day boundary: still "stable" at exactly one year, // "stale" the day after. 2025-04-20 is 365 days before now. {"one year exact (stable boundary)", "2025-04-20", "1y ago", "stable"}, @@ -33,7 +38,12 @@ func TestHumanizeAgoAt(t *testing.T) { {"bad input", "not-a-date", "", ""}, // Future dates (clock skew) yield empty — we don't label "in 3d" // on an index that exists to surface recency of PAST commits. - {"future date", "2026-05-01", "", ""}, + {"full-day future", "2026-05-01", "", ""}, + // Sub-day future (committer-date hours ahead of scanner clock) + // lands in days==0 → "today". Documented: the index works at + // day granularity and "today" is the least-misleading fallback + // for intra-day skew. + {"same-day future (sub-day skew)", "2026-04-20", "today", "fresh"}, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { @@ -89,6 +99,18 @@ func TestGenerateScanIndex_RecencyChipRenders(t *testing.T) { if !strings.Contains(out, `>2d ago<`) { t.Error("recency label text missing") } + // Anchor the chip inside the `.dates` wrapper — without this, + // a future template change that moved the chip outside the + // dates cell (losing the date context) would still satisfy the + // class-only assertion above. 
+ if !strings.Contains(out, `class="dates"`) { + t.Error("`.dates` wrapper missing — the recency chip has lost its structural anchor") + } + datesIdx := strings.Index(out, `class="dates"`) + recencyIdx := strings.Index(out, `class="recency fresh"`) + if datesIdx < 0 || recencyIdx < 0 || recencyIdx < datesIdx { + t.Errorf("recency chip not inside `.dates` wrapper: dates@%d recency@%d", datesIdx, recencyIdx) + } // Failed entry has no dates block, so no recency chip should // render for it — a weak but useful guard against a template // restructure leaking the chip into the failure render. diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 2d13774..533f522 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -187,21 +187,23 @@ func Discover(ctx context.Context, roots []string, matcher *Matcher, maxDepth in // grows the length if a truncation collision occurs. const initialSlugHashLen = 6 -// reservedSlugs are base names that cannot be emitted bare because -// downstream consumers reserve the name for their own output file. +// isReservedSlug reports whether base would collide with a name +// downstream consumers already use for their own output file. // `scan --report-dir` writes /index.html as the landing page; -// a repo whose basename was literally `index` would collide and -// silently get overwritten by the landing or vice versa. Forcing -// the hash branch for reserved names avoids the collision without -// losing the repo. Compared case-insensitively to align with the -// case-folding done elsewhere in assignSlugs. -var reservedSlugs = map[string]struct{}{ - "index": {}, -} - +// a repo whose basename was literally `index` would overwrite it +// (or be overwritten by it). Forcing the hash branch for reserved +// names avoids the collision without losing the repo. 
+// +// Switch, not a package-level map, to keep the reserved set +// immutable — a mutable map would let one test silently leak an +// entry into every other test that runs afterward. Case folded to +// align with the case-insensitivity elsewhere in assignSlugs. func isReservedSlug(base string) bool { - _, ok := reservedSlugs[strings.ToLower(base)] - return ok + switch strings.ToLower(base) { + case "index": + return true + } + return false } // assignSlugs derives a unique slug per repo from its basename, falling From 87d3e8b5818fe243f382bff43e3fe84a52e954c6 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:39:27 -0300 Subject: [PATCH 29/37] Drop scan-index H1; promote summary cards to page anchor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "Index · N repositories" heading restated information the summary strip's "Repositories" card already surfaces one line below. Removing it lets the three-card summary (Repositories, Total commits, Unique devs) open the page — the counts users actually came to see. Dead `h1` CSS rule dropped at the same time. Test now anchors on the "Repositories" label in the summary strip, which survives layout tweaks better than a specific H1 string. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main_test.go | 7 +++++-- internal/report/scan_index.go | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index ba25b4c..34c55ee 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -200,8 +200,11 @@ func TestScanCmd_ReportDirGeneratesPerRepoHTMLAndIndex(t *testing.T) { if !strings.Contains(index, `href="alpha.html"`) || !strings.Contains(index, `href="beta.html"`) { t.Errorf("index.html should link to both per-repo reports; got body excerpt: %.300s", index) } - if !strings.Contains(index, "

Index") { - t.Errorf("index.html missing title block") + // The summary card strip is now the top-of-page anchor (no H1). + // Asserting on the "Repositories" label keeps the test meaningful + // if the card layout changes. + if !strings.Contains(index, "Repositories") { + t.Errorf("index.html missing summary strip") } } diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 4887641..2393546 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -137,7 +137,6 @@ const scanIndexHTML = `