From 369e78b51ed08fcb1ce02792925c9261c08db63a Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 11:49:17 -0300 Subject: [PATCH 01/37] Add scan command for multi-repo discovery and consolidated reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Walks one or more roots, finds every git repo, runs extract per repo in parallel, and emits a manifest plus optional consolidated HTML report. The report grows a per-repository breakdown surfacing how work distributes across repos — both in the team report and in the --email developer profile, where Files/commits/churn are filtered to that dev's contributions. - internal/scan: discovery, gitignore-style ignore matcher, worker pool. Two-pass slug naming makes .state stable across runs so resume keeps working when sibling repos share basenames. - internal/stats: commitEntry carries the LoadMultiJSONL pathPrefix; RepoBreakdown groups commits + files by repo with optional email filter (per-dev file count uses devCommits, not the repo total). - internal/report: Per-Repository Breakdown section in both the main and profile templates, gated on len(Repos) > 1. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 162 +++++++++++++++ internal/report/profile_template.go | 36 ++++ internal/report/report.go | 15 ++ internal/report/template.go | 39 ++++ internal/scan/discovery.go | 175 +++++++++++++++++ internal/scan/discovery_test.go | 166 ++++++++++++++++ internal/scan/ignore.go | 160 +++++++++++++++ internal/scan/ignore_test.go | 81 ++++++++ internal/scan/scan.go | 271 ++++++++++++++++++++++++++ internal/scan/scan_test.go | 138 +++++++++++++ internal/stats/reader.go | 6 + internal/stats/repo_breakdown.go | 169 ++++++++++++++++ internal/stats/repo_breakdown_test.go | 127 ++++++++++++ 13 files changed, 1545 insertions(+) create mode 100644 internal/scan/discovery.go create mode 100644 internal/scan/discovery_test.go create mode 100644 internal/scan/ignore.go create mode 100644 internal/scan/ignore_test.go create mode 100644 internal/scan/scan.go create mode 100644 internal/scan/scan_test.go create mode 100644 internal/stats/repo_breakdown.go create mode 100644 internal/stats/repo_breakdown_test.go diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 50267c4..ddcf0eb 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -16,6 +16,7 @@ import ( "github.com/lex0c/gitcortex/internal/extract" "github.com/lex0c/gitcortex/internal/git" reportpkg "github.com/lex0c/gitcortex/internal/report" + "github.com/lex0c/gitcortex/internal/scan" "github.com/lex0c/gitcortex/internal/stats" "github.com/spf13/cobra" @@ -35,6 +36,7 @@ func main() { rootCmd.AddCommand(diffCmd()) rootCmd.AddCommand(ciCmd()) rootCmd.AddCommand(reportCmd()) + rootCmd.AddCommand(scanCmd()) if err := rootCmd.Execute(); err != nil { os.Exit(1) @@ -869,3 +871,163 @@ func reportCmd() *cobra.Command { return cmd } + +// --- Scan --- + +func scanCmd() *cobra.Command { + var ( + roots []string + output string + ignoreFile string + maxDepth int + parallel int + email string + from string + to string + since string + reportPath string 
+ topN int + extractIgnore []string + batchSize int + mailmap bool + firstParent bool + includeMessages bool + couplingMaxFiles int + couplingMinChanges int + churnHalfLife int + networkMinFiles int + ) + + cmd := &cobra.Command{ + Use: "scan", + Short: "Discover git repositories under one or more roots and consolidate their history", + Long: `Walk the given root(s), find every git repository, and run extract on each +repository in parallel. Outputs one JSONL per repo plus a manifest in --output. +Optionally generates a consolidated HTML report including a per-repository +breakdown — handy for showing aggregated work across many repos.`, + RunE: func(cmd *cobra.Command, args []string) error { + if len(roots) == 0 { + return fmt.Errorf("--root is required (repeatable for multiple roots)") + } + if since != "" && (from != "" || to != "") { + return fmt.Errorf("--since cannot be combined with --from/--to") + } + if err := validateDate(from, "--from"); err != nil { + return err + } + if err := validateDate(to, "--to"); err != nil { + return err + } + if from != "" && to != "" && from > to { + return fmt.Errorf("--from (%s) must be on or before --to (%s)", from, to) + } + + cfg := scan.Config{ + Roots: roots, + Output: output, + IgnoreFile: ignoreFile, + MaxDepth: maxDepth, + Parallel: parallel, + Extract: extract.Config{ + BatchSize: batchSize, + IncludeMessages: includeMessages, + CommandTimeout: extract.DefaultCommandTimeout, + FirstParent: firstParent, + Mailmap: mailmap, + IgnorePatterns: extractIgnore, + StartOffset: -1, + }, + } + + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + + result, err := scan.Run(ctx, cfg) + // scan.Run returns a partial result alongside ctx.Err() on + // cancellation. Honor that — write whatever progress we made + // to disk and surface the error so the CLI exits non-zero. 
+ if err != nil { + return err + } + + if reportPath == "" { + fmt.Fprintf(os.Stderr, "Scan complete: %d JSONL file(s) in %s\n", len(result.JSONLs), result.OutputDir) + return nil + } + if len(result.JSONLs) == 0 { + return fmt.Errorf("no successful repos extracted; cannot build report") + } + + fromDate := from + if since != "" { + d, err := parseSince(since) + if err != nil { + return err + } + fromDate = d + } + + ds, err := stats.LoadMultiJSONL(result.JSONLs, stats.LoadOptions{ + From: fromDate, + To: to, + HalfLifeDays: churnHalfLife, + CoupMaxFiles: couplingMaxFiles, + }) + if err != nil { + return fmt.Errorf("load consolidated dataset: %w", err) + } + fmt.Fprintf(os.Stderr, "Loaded %d commits across %d repo(s)\n", ds.CommitCount, len(result.JSONLs)) + + f, err := os.Create(reportPath) + if err != nil { + return fmt.Errorf("create report: %w", err) + } + defer f.Close() + + // Label the report after the basename of the first --root + // (or the output dir as a fallback). "scan-scan-output" was + // the previous default; users find the root name far more + // recognizable as the H1 of the report. 
+ repoLabel := filepath.Base(result.OutputDir) + if len(cfg.Roots) > 0 { + repoLabel = filepath.Base(absPath(cfg.Roots[0])) + } + sf := stats.StatsFlags{CouplingMinChanges: couplingMinChanges, NetworkMinFiles: networkMinFiles} + if email != "" { + if err := reportpkg.GenerateProfile(f, ds, repoLabel, email); err != nil { + return fmt.Errorf("generate profile report: %w", err) + } + fmt.Fprintf(os.Stderr, "Profile report for %s written to %s\n", email, fileURL(reportPath)) + return nil + } + if err := reportpkg.Generate(f, ds, repoLabel, topN, sf); err != nil { + return fmt.Errorf("generate report: %w", err) + } + fmt.Fprintf(os.Stderr, "Consolidated report written to %s\n", fileURL(reportPath)) + return nil + }, + } + + cmd.Flags().StringSliceVar(&roots, "root", nil, "Root directory to walk for repositories (repeatable)") + cmd.Flags().StringVar(&output, "output", "scan-output", "Directory to write per-repo JSONL files and the manifest") + cmd.Flags().StringVar(&ignoreFile, "ignore-file", "", "Gitignore-style file with directories to skip during discovery. When unset, only the first --root is searched for a .gitcortex-ignore; pass an explicit path to apply rules across all roots.") + cmd.Flags().IntVar(&maxDepth, "max-depth", 0, "Maximum directory depth to descend into when looking for repos (0 = unlimited)") + cmd.Flags().IntVar(¶llel, "parallel", 4, "Number of repositories to extract in parallel") + cmd.Flags().StringVar(&email, "email", "", "Generate a per-developer profile report (only when --report is set)") + cmd.Flags().StringVar(&from, "from", "", "Window start date YYYY-MM-DD (forwarded to the consolidated report)") + cmd.Flags().StringVar(&to, "to", "", "Window end date YYYY-MM-DD (forwarded to the consolidated report)") + cmd.Flags().StringVar(&since, "since", "", "Filter to recent period (e.g. 
7d, 4w, 3m, 1y); mutually exclusive with --from/--to") + cmd.Flags().StringVar(&reportPath, "report", "", "If set, generate a consolidated HTML report at this path after the scan") + cmd.Flags().IntVar(&topN, "top", 20, "Top-N entries per section in the consolidated report") + cmd.Flags().StringSliceVar(&extractIgnore, "extract-ignore", nil, "Glob patterns forwarded to per-repo extract --ignore (e.g. package-lock.json)") + cmd.Flags().IntVar(&batchSize, "batch-size", 1000, "Per-repo extract checkpoint interval") + cmd.Flags().BoolVar(&mailmap, "mailmap", false, "Use .mailmap (per repo) to normalize identities") + cmd.Flags().BoolVar(&firstParent, "first-parent", false, "Restrict extracts to the first-parent chain") + cmd.Flags().BoolVar(&includeMessages, "include-commit-messages", false, "Include commit messages in JSONL (needed for Top Commits in the consolidated report)") + cmd.Flags().IntVar(&couplingMaxFiles, "coupling-max-files", 50, "Max files per commit for coupling analysis (consolidated report)") + cmd.Flags().IntVar(&couplingMinChanges, "coupling-min-changes", 5, "Min co-changes for coupling results (consolidated report)") + cmd.Flags().IntVar(&churnHalfLife, "churn-half-life", 90, "Half-life in days for churn decay (consolidated report)") + cmd.Flags().IntVar(&networkMinFiles, "network-min-files", 5, "Min shared files for dev-network edges (consolidated report)") + + return cmd +} diff --git a/internal/report/profile_template.go b/internal/report/profile_template.go index 5bb8f0f..5fedc3a 100644 --- a/internal/report/profile_template.go +++ b/internal/report/profile_template.go @@ -134,6 +134,42 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{end}} +{{if gt (len .Repos) 1}} +

Per-Repository Breakdown {{thousands (len .Repos)}} repositories

+

How {{.Profile.Name}}'s work is split across repositories in this scan. Use this to point at the projects with the most activity, or to spot single-repo focus vs. broad multi-repo engagement.

+ + + + + + + + + + + +{{range .Repos}} + + + + + + + + + + +{{end}} +
RepositoryCommits% of My CommitsChurn% of My ChurnFilesActive daysFirst → Last
{{.Repo}}{{thousands .Commits}} +
+
+
+
+ {{printf "%.1f" .PctOfTotalCommits}}% +
+
{{thousands .Churn}}{{printf "%.1f" .PctOfTotalChurn}}%{{thousands .Files}}{{.ActiveDays}}{{.FirstCommitDate}} → {{.LastCommitDate}}
+{{end}} + {{if .Profile.TopFiles}}

Top Files

Files this developer changed most (churn = additions + deletions). High churn on few files suggests deep ownership and potential knowledge concentration. · {{docRef "hotspots"}}

diff --git a/internal/report/report.go b/internal/report/report.go index 80e875f..358c894 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -68,6 +68,12 @@ type ReportData struct { TotalDirectories int TotalExtensions int TotalBusFactorFiles int + + // Repos holds per-repository aggregates for multi-repo (scan) reports. + // Empty on single-repo runs — the template gates the section behind + // `{{if gt (len .Repos) 1}}` so single-repo callers keep their + // existing layout untouched. + Repos []stats.RepoStat } // htmlTreeDepth caps the repo-structure tree baked into the HTML report. @@ -376,6 +382,7 @@ func Generate(w io.Writer, ds *stats.Dataset, repoName string, topN int, sf stat TotalDirectories: stats.DirectoryCount(ds), TotalExtensions: stats.ExtensionCount(ds), TotalBusFactorFiles: stats.BusFactorCount(ds), + Repos: stats.RepoBreakdown(ds, ""), } CapChildrenPerDir(data.Structure, htmlTreeMaxChildrenPerDir) @@ -641,6 +648,13 @@ type ProfileReportData struct { MaxActivityCommits int PatternGrid [7][24]int MaxPattern int + + // Repos is the per-repository breakdown filtered to this developer's + // commits. Empty on single-repo profile reports — gated in the + // template so existing single-repo callers see no change. The headline + // use case for `gitcortex scan --email me` lives here: the developer + // can see at a glance which repos they spent time in. 
+ Repos []stats.RepoStat } func GenerateProfile(w io.Writer, ds *stats.Dataset, repoName, email string) error { @@ -672,6 +686,7 @@ func GenerateProfile(w io.Writer, ds *stats.Dataset, repoName, email string) err MaxActivityCommits: maxAct, PatternGrid: p.WorkGrid, MaxPattern: maxP, + Repos: stats.RepoBreakdown(ds, email), } return profileTmpl.Execute(w, data) diff --git a/internal/report/template.go b/internal/report/template.go index d6a3a64..5abc4e1 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -145,6 +145,45 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col +{{if gt (len .Repos) 1}} +

Per-Repository Breakdown {{thousands (len .Repos)}} repositories

+

Cross-repo aggregation from scan. Use this view to see how work is distributed: a single repo dominating commits or churn often means the consolidated activity story is really about that one project; an even spread shows broad multi-repo engagement. Bar widths are normalized to the largest commit-count repo.

+{{$maxRepoCommits := 0}}{{range .Repos}}{{if gt .Commits $maxRepoCommits}}{{$maxRepoCommits = .Commits}}{{end}}{{end}} + + + + + + + + + + + + +{{range .Repos}} + + + + + + + + + + + +{{end}} +
RepositoryCommits% CommitsChurn% ChurnFilesActive daysDevsFirst → Last
{{.Repo}}{{thousands .Commits}} +
+
+
+
+ {{printf "%.1f" .PctOfTotalCommits}}% +
+
{{thousands .Churn}}{{printf "%.1f" .PctOfTotalChurn}}%{{thousands .Files}}{{.ActiveDays}}{{.UniqueDevs}}{{.FirstCommitDate}} → {{.LastCommitDate}}
+{{end}} + {{if .ActivityYears}}

Activity

Monthly commit heatmap. Darker = more commits. Sudden drop-offs may mark team changes, re-orgs, or freezes; steady cadence signals healthy pace. Hover for details; toggle to table for exact numbers. · {{docRef "activity"}}

diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go new file mode 100644 index 0000000..0d7b66f --- /dev/null +++ b/internal/scan/discovery.go @@ -0,0 +1,175 @@ +package scan + +import ( + "crypto/sha1" + "encoding/hex" + "fmt" + "log" + "os" + "path/filepath" + "sort" + "strings" +) + +// Repo describes one git repository discovered during scan. +type Repo struct { + // AbsPath is the absolute path to the working tree root. + AbsPath string + // RelPath is AbsPath relative to the scan root it was found under. + RelPath string + // Slug is a filesystem-safe identifier used to derive the output + // JSONL file name and the path prefix that stats will use as the + // repo label. Must be unique across the scan (collisions are resolved + // by appending a short hash of AbsPath). + Slug string +} + +// Discover walks the given roots and returns every git repository it finds. +// Directories matched by the ignore matcher are pruned. Each repo's Slug is +// unique across the full result so downstream JSONL files don't collide, +// and the prefix LoadMultiJSONL derives (basename-minus-ext) still groups +// correctly. +func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { + if matcher == nil { + matcher = NewMatcher(nil) + } + var repos []Repo + seen := make(map[string]bool) + + for _, root := range roots { + abs, err := filepath.Abs(root) + if err != nil { + return nil, fmt.Errorf("resolve %s: %w", root, err) + } + info, err := os.Stat(abs) + if err != nil { + return nil, fmt.Errorf("stat %s: %w", abs, err) + } + if !info.IsDir() { + return nil, fmt.Errorf("%s is not a directory", abs) + } + + err = filepath.WalkDir(abs, func(path string, d os.DirEntry, werr error) error { + if werr != nil { + // Permission errors on one subtree shouldn't abort the + // whole scan. Log and keep walking. 
+ log.Printf("scan: skip %s: %v", path, werr) + if d != nil && d.IsDir() { + return filepath.SkipDir + } + return nil + } + if !d.IsDir() { + return nil + } + + rel, _ := filepath.Rel(abs, path) + rel = filepath.ToSlash(rel) + + if maxDepth > 0 && rel != "." { + depth := strings.Count(rel, "/") + 1 + if depth > maxDepth { + return filepath.SkipDir + } + } + + if rel != "." && matcher.Match(rel, true) { + return filepath.SkipDir + } + + // Detect .git either as a directory (normal) or a file + // (worktree / submodule). Presence alone is enough — we don't + // try to validate the content; extract will fail loudly if + // the repo is broken, which is a better signal than silently + // skipping. + // Lstat (not Stat) so a symlink named `.git` pointing somewhere + // arbitrary doesn't get treated as a real repo. Real `.git` + // entries are either a directory (normal worktree) or a regular + // file (worktree/submodule pointer). Both are caught here; a + // symlink is rejected, which is the conservative choice — if + // users genuinely point `.git` at a shared dir via symlink they + // can resolve it before scanning. + gitEntry := filepath.Join(path, ".git") + if info, statErr := os.Lstat(gitEntry); statErr == nil && info.Mode()&os.ModeSymlink == 0 { + if seen[path] { + return filepath.SkipDir + } + seen[path] = true + repos = append(repos, Repo{ + AbsPath: path, + RelPath: rel, + }) + // Don't descend into a repo — nested repos (submodules, + // vendored repos) get picked up separately only if they + // live outside this repo's worktree, which is rare. If + // users need submodule coverage they can list the parent + // and the submodule paths as separate roots. + return filepath.SkipDir + } + + return nil + }) + if err != nil { + return nil, err + } + } + + assignSlugs(repos) + // Sort by slug so output ordering is deterministic across runs + // (WalkDir ordering is, but concatenating multiple roots could shuffle). 
+ sort.Slice(repos, func(i, j int) bool { return repos[i].Slug < repos[j].Slug }) + return repos, nil +} + +// assignSlugs derives a unique slug per repo from its basename, falling +// back to `-` when two repos share a name. +// The slug is also the JSONL filename stem and the persistence key +// (`.state`), so uniqueness AND determinism across runs both matter: +// if a re-run swaps which sibling gets the bare name, the per-repo +// state file is orphaned and the affected repos are re-extracted from +// scratch (or worse, two repos collide onto one state). +// +// Two-pass: count basenames first, then suffix with a hash whenever the +// base appears more than once anywhere in the result. This makes the +// slug a pure function of (absPath, set of basenames seen), independent +// of WalkDir traversal order. +func assignSlugs(repos []Repo) { + counts := make(map[string]int) + bases := make([]string, len(repos)) + for i := range repos { + base := sanitizeSlug(filepath.Base(repos[i].AbsPath)) + if base == "" { + base = "repo" + } + bases[i] = base + counts[base]++ + } + for i := range repos { + base := bases[i] + slug := base + if counts[base] > 1 { + h := sha1.Sum([]byte(repos[i].AbsPath)) + slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:6]) + } + repos[i].Slug = slug + } +} + +// sanitizeSlug strips characters that would break the LoadMultiJSONL prefix +// contract (`:`) or filesystem paths. Keeps alphanumerics, dash, +// underscore, and dot. 
+func sanitizeSlug(s string) string { + var b strings.Builder + for _, r := range s { + switch { + case r >= 'a' && r <= 'z', + r >= 'A' && r <= 'Z', + r >= '0' && r <= '9', + r == '-', r == '_', r == '.': + b.WriteRune(r) + default: + b.WriteRune('_') + } + } + return b.String() +} diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go new file mode 100644 index 0000000..4bb7a0c --- /dev/null +++ b/internal/scan/discovery_test.go @@ -0,0 +1,166 @@ +package scan + +import ( + "os" + "path/filepath" + "testing" +) + +func TestDiscover_FindsRepos(t *testing.T) { + root := t.TempDir() + + mustMkRepo(t, filepath.Join(root, "a")) + mustMkRepo(t, filepath.Join(root, "b")) + mustMkRepo(t, filepath.Join(root, "nested", "c")) + // Plain dir without .git — must NOT be picked up. + if err := os.MkdirAll(filepath.Join(root, "not-a-repo", "src"), 0o755); err != nil { + t.Fatal(err) + } + + repos, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatalf("Discover: %v", err) + } + if len(repos) != 3 { + t.Fatalf("expected 3 repos, got %d: %+v", len(repos), repos) + } + got := map[string]bool{} + for _, r := range repos { + got[r.Slug] = true + } + for _, want := range []string{"a", "b", "c"} { + if !got[want] { + t.Errorf("expected slug %q in %v", want, got) + } + } +} + +func TestDiscover_RespectsIgnore(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "keep")) + mustMkRepo(t, filepath.Join(root, "node_modules", "garbage")) + + matcher := NewMatcher([]string{"node_modules"}) + repos, err := Discover([]string{root}, matcher, 0) + if err != nil { + t.Fatal(err) + } + if len(repos) != 1 || repos[0].Slug != "keep" { + t.Fatalf("expected only `keep`, got %+v", repos) + } +} + +func TestDiscover_DoesNotDescendIntoRepo(t *testing.T) { + root := t.TempDir() + parent := filepath.Join(root, "parent") + mustMkRepo(t, parent) + // A nested .git inside an already-discovered repo is treated as a + // submodule and skipped 
— we don't double-count. + mustMkRepo(t, filepath.Join(parent, "submodule")) + + repos, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + if len(repos) != 1 { + t.Fatalf("expected 1 repo (parent only), got %d: %+v", len(repos), repos) + } +} + +func TestDiscover_SlugCollisionGetsHashSuffix(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "a", "myrepo")) + mustMkRepo(t, filepath.Join(root, "b", "myrepo")) + + repos, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + if len(repos) != 2 { + t.Fatalf("expected 2 repos, got %d", len(repos)) + } + if repos[0].Slug == repos[1].Slug { + t.Errorf("collision not resolved: both slugs are %q", repos[0].Slug) + } + // With the two-pass naming, BOTH duplicates carry a hash suffix — + // neither gets the bare basename. This guarantees the slug for any + // given path is stable regardless of which sibling WalkDir hits + // first across runs. + for _, r := range repos { + if r.Slug == "myrepo" { + t.Errorf("expected both colliding repos to get a hash suffix, but %s kept the bare name", r.AbsPath) + } + } +} + +// Re-running discovery must produce the same slug for the same path +// even when the WalkDir traversal could legally vary. This is the +// invariant `.state` resumption depends on. 
+func TestDiscover_SlugDeterministicAcrossRuns(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "a", "myrepo")) + mustMkRepo(t, filepath.Join(root, "b", "myrepo")) + + first, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + second, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + if len(first) != len(second) { + t.Fatalf("repo count differs across runs: %d vs %d", len(first), len(second)) + } + pathSlug := map[string]string{} + for _, r := range first { + pathSlug[r.AbsPath] = r.Slug + } + for _, r := range second { + if pathSlug[r.AbsPath] != r.Slug { + t.Errorf("slug for %s changed across runs: %q → %q", r.AbsPath, pathSlug[r.AbsPath], r.Slug) + } + } +} + +func TestDiscover_RejectsSymlinkGit(t *testing.T) { + root := t.TempDir() + // A "repo" whose .git is a symlink — should not be picked up. + bad := filepath.Join(root, "weird") + if err := os.MkdirAll(bad, 0o755); err != nil { + t.Fatal(err) + } + if err := os.Symlink("/etc/hostname", filepath.Join(bad, ".git")); err != nil { + t.Skipf("symlink unsupported here: %v", err) + } + mustMkRepo(t, filepath.Join(root, "real")) + + repos, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + if len(repos) != 1 || repos[0].Slug != "real" { + t.Fatalf("expected only `real`, got %+v", repos) + } +} + +func TestDiscover_MaxDepthHonored(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "shallow")) + mustMkRepo(t, filepath.Join(root, "a", "b", "c", "deep")) + + repos, err := Discover([]string{root}, NewMatcher(nil), 2) + if err != nil { + t.Fatal(err) + } + if len(repos) != 1 || repos[0].Slug != "shallow" { + t.Fatalf("expected only shallow, got %+v", repos) + } +} + +func mustMkRepo(t *testing.T, path string) { + t.Helper() + if err := os.MkdirAll(filepath.Join(path, ".git"), 0o755); err != nil { + t.Fatal(err) + } +} diff --git a/internal/scan/ignore.go 
b/internal/scan/ignore.go new file mode 100644 index 0000000..c378b05 --- /dev/null +++ b/internal/scan/ignore.go @@ -0,0 +1,160 @@ +package scan + +import ( + "bufio" + "fmt" + "os" + "path" + "path/filepath" + "strings" +) + +// IgnoreRule is one line of the ignore file, pre-parsed. +// Supported syntax (a subset of gitignore, enough for directory pruning): +// +// # comment (ignored) +// node_modules (literal name — matches as basename or full path) +// vendor/ (directory-only; matches dirs named vendor) +// archive/* (glob — forwarded to path.Match on the rel path) +// **/generated (deep-match: same as `generated` matched anywhere) +// !important (negation — re-includes a match) +// +// Matching is evaluated in order; the last rule to match wins, following +// gitignore semantics so `!foo` after `foo/` re-includes foo. +type IgnoreRule struct { + Pattern string + Negate bool + DirOnly bool +} + +type Matcher struct { + rules []IgnoreRule +} + +func NewMatcher(patterns []string) *Matcher { + m := &Matcher{} + for _, p := range patterns { + if r, ok := parseRule(p); ok { + m.rules = append(m.rules, r) + } + } + return m +} + +// LoadMatcher reads an ignore file and returns a matcher. A missing file +// is not an error — scan simply proceeds with no user-provided rules. +// This matches the gitignore contract where `.gitignore` is optional. 
+func LoadMatcher(file string) (*Matcher, error) { + f, err := os.Open(file) + if err != nil { + if os.IsNotExist(err) { + return NewMatcher(nil), nil + } + return nil, fmt.Errorf("open %s: %w", file, err) + } + defer f.Close() + + var patterns []string + scanner := bufio.NewScanner(f) + for scanner.Scan() { + patterns = append(patterns, scanner.Text()) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("read %s: %w", file, err) + } + return NewMatcher(patterns), nil +} + +func parseRule(line string) (IgnoreRule, bool) { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + return IgnoreRule{}, false + } + r := IgnoreRule{} + if strings.HasPrefix(line, "!") { + r.Negate = true + line = line[1:] + } + if strings.HasSuffix(line, "/") { + r.DirOnly = true + line = strings.TrimSuffix(line, "/") + } + r.Pattern = line + return r, r.Pattern != "" +} + +// Match reports whether relPath should be ignored. isDir hints directory- +// only rules (`vendor/` only fires on dirs). +// +// Evaluation order: scan every rule, track the last match. This is what +// gitignore does — it lets `!src/keep` override a broader earlier `src/` +// block without forcing the user to care about rule ordering beyond the +// obvious "put exceptions after the thing they exclude". +func (m *Matcher) Match(relPath string, isDir bool) bool { + relPath = filepath.ToSlash(relPath) + matched := false + for _, r := range m.rules { + if r.DirOnly && !isDir { + continue + } + if !matchRule(r, relPath) { + continue + } + matched = !r.Negate + } + return matched +} + +func matchRule(r IgnoreRule, relPath string) bool { + pat := r.Pattern + + // **/foo → strip leading **/ and treat as "match foo anywhere" — + // same as the basename-or-suffix check below. We don't support + // arbitrary ** in the middle of patterns because users coming from + // gitignore expect the common forms (prefix dir, basename, ext), + // not the full double-star algebra. 
Saves us writing a mini doublestar + // engine for a marginal feature. + if strings.HasPrefix(pat, "**/") { + pat = strings.TrimPrefix(pat, "**/") + } + + base := path.Base(relPath) + + // Literal basename: `node_modules` matches any segment named that. + if pat == base { + return true + } + // Any segment of the path equals the pattern (literal dir/file name + // embedded in the tree). "vendor" matches "a/b/vendor/c.go". + if !strings.ContainsAny(pat, "*?[") && segmentMatch(pat, relPath) { + return true + } + // Glob on basename: "*.log" + if matched, _ := path.Match(pat, base); matched { + return true + } + // Glob on full relative path: "archive/*" + if matched, _ := path.Match(pat, relPath); matched { + return true + } + // Directory prefix: "archive/" (after DirOnly stripping applied above + // removed the trailing slash) or "archive/*". + prefix := strings.TrimSuffix(pat, "*") + prefix = strings.TrimSuffix(prefix, "/") + if prefix != "" && prefix != pat { + if strings.HasPrefix(relPath, prefix+"/") || relPath == prefix { + return true + } + } + return false +} + +func segmentMatch(needle, relPath string) bool { + for _, seg := range strings.Split(relPath, "/") { + if seg == needle { + return true + } + } + return false +} + diff --git a/internal/scan/ignore_test.go b/internal/scan/ignore_test.go new file mode 100644 index 0000000..ed9510f --- /dev/null +++ b/internal/scan/ignore_test.go @@ -0,0 +1,81 @@ +package scan + +import ( + "os" + "path/filepath" + "testing" +) + +func TestMatcher_Basics(t *testing.T) { + m := NewMatcher([]string{ + "# comment line", + "", + "node_modules", + "vendor/", + "archive/*", + "*.log", + "!important.log", + "**/generated", + }) + + cases := []struct { + name string + path string + isDir bool + want bool + }{ + {"basename dir match", "a/b/node_modules", true, true}, + {"basename file match in middle", "a/node_modules/c.go", false, true}, + {"vendor dir-only match", "src/vendor", true, true}, + {"vendor on file does not match", 
"src/vendor", false, false}, + {"glob in subdir", "archive/2024", true, true}, + {"glob extension", "out/build.log", false, true}, + {"negation re-includes", "out/important.log", false, false}, + {"deep generated", "src/foo/generated", true, true}, + {"unrelated path", "src/main.go", false, false}, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := m.Match(c.path, c.isDir) + if got != c.want { + t.Errorf("Match(%q, dir=%v) = %v, want %v", c.path, c.isDir, got, c.want) + } + }) + } +} + +func TestLoadMatcher_MissingFileIsOK(t *testing.T) { + m, err := LoadMatcher(filepath.Join(t.TempDir(), "missing")) + if err != nil { + t.Fatalf("expected no error for missing file, got %v", err) + } + if m == nil { + t.Fatal("matcher should not be nil") + } + if m.Match("anything", false) { + t.Error("empty matcher should match nothing") + } +} + +func TestLoadMatcher_FromFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, ".gitcortex-ignore") + contents := "# comment\nnode_modules\nvendor/\n" + if err := os.WriteFile(path, []byte(contents), 0o644); err != nil { + t.Fatal(err) + } + m, err := LoadMatcher(path) + if err != nil { + t.Fatalf("LoadMatcher: %v", err) + } + if !m.Match("foo/node_modules", true) { + t.Error("node_modules should match") + } + if !m.Match("vendor", true) { + t.Error("vendor/ should match dirs") + } + if m.Match("vendor", false) { + t.Error("vendor/ should not match files") + } +} diff --git a/internal/scan/scan.go b/internal/scan/scan.go new file mode 100644 index 0000000..0d4a41d --- /dev/null +++ b/internal/scan/scan.go @@ -0,0 +1,271 @@ +package scan + +import ( + "context" + "encoding/json" + "fmt" + "log" + "os" + "path/filepath" + "sync" + "time" + + "github.com/lex0c/gitcortex/internal/extract" + "github.com/lex0c/gitcortex/internal/git" +) + +// Config holds scan command input. 
The Extract template is copied per-repo +// and its Repo/Output/StateFile/Branch are overwritten per worker — the +// template exists to carry shared flags (ignore patterns, batch size, +// mailmap, etc.) without re-declaring them on the scan surface. +type Config struct { + Roots []string + Output string + IgnoreFile string + MaxDepth int + Parallel int + Extract extract.Config +} + +// Manifest is persisted to /manifest.json so subsequent runs +// (or the report stage) can discover which JSONL files to consolidate +// without re-walking the filesystem. +type Manifest struct { + GeneratedAt string `json:"generated_at"` + Roots []string `json:"roots"` + Repos []ManifestRepo `json:"repos"` +} + +type ManifestRepo struct { + Slug string `json:"slug"` + Path string `json:"path"` + Branch string `json:"branch"` + JSONL string `json:"jsonl"` + StateFile string `json:"state_file"` + Status string `json:"status"` // ok | failed | skipped + Error string `json:"error,omitempty"` + Commits int `json:"commits,omitempty"` + DurationMS int64 `json:"duration_ms,omitempty"` +} + +// Result is returned to the caller so the CLI can drive the report stage +// with the concrete JSONL paths (no need to re-read the manifest). +type Result struct { + OutputDir string + Manifest Manifest + JSONLs []string // only successful repos +} + +// Run discovers repos under cfg.Roots, writes one JSONL per repo via +// the existing extract pipeline, and emits a manifest. Failures on +// individual repos are recorded in the manifest with Status="failed" +// but do not abort the whole scan — the report can still be generated +// from the repos that succeeded. 
+func Run(ctx context.Context, cfg Config) (*Result, error) { + if len(cfg.Roots) == 0 { + return nil, fmt.Errorf("scan: at least one --root is required") + } + if cfg.Output == "" { + return nil, fmt.Errorf("scan: --output is required") + } + if err := os.MkdirAll(cfg.Output, 0o755); err != nil { + return nil, fmt.Errorf("create output dir: %w", err) + } + if cfg.Parallel <= 0 { + cfg.Parallel = 1 + } + + matcher, err := loadMatcher(cfg) + if err != nil { + return nil, err + } + + repos, err := Discover(cfg.Roots, matcher, cfg.MaxDepth) + if err != nil { + return nil, err + } + if len(repos) == 0 { + return nil, fmt.Errorf("scan: no git repositories found under %v", cfg.Roots) + } + log.Printf("scan: discovered %d repositories", len(repos)) + + manifest := Manifest{ + GeneratedAt: time.Now().UTC().Format(time.RFC3339), + Roots: cfg.Roots, + Repos: make([]ManifestRepo, len(repos)), + } + // Pre-fill every slot with the repo's identity and a "pending" + // status. If the user cancels (Ctrl+C) before a worker reaches a + // repo, the manifest still names it with Status="pending" instead + // of an empty struct that misleads "ok, 0 failed" accounting. + for i, r := range repos { + manifest.Repos[i] = ManifestRepo{ + Slug: r.Slug, + Path: r.AbsPath, + JSONL: filepath.Join(cfg.Output, r.Slug+".jsonl"), + StateFile: filepath.Join(cfg.Output, r.Slug+".state"), + Status: "pending", + } + } + + // Worker pool over repos. Each worker runs extract.Run against one + // repo and writes to its dedicated JSONL + state file. The extract + // package already checkpoints per-repo state, so an interrupted scan + // can be resumed by re-invoking the command — completed repos will + // be skipped (their state file's last SHA prevents re-emission) and + // partials will resume. 
+ type job struct { + idx int + repo Repo + } + jobs := make(chan job) + var wg sync.WaitGroup + + for w := 0; w < cfg.Parallel; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := range jobs { + entry := runOne(ctx, cfg, j.repo) + manifest.Repos[j.idx] = entry + } + }() + } + + go func() { + for i, r := range repos { + select { + case <-ctx.Done(): + close(jobs) + return + case jobs <- job{idx: i, repo: r}: + } + } + close(jobs) + }() + + wg.Wait() + + manifestPath := filepath.Join(cfg.Output, "manifest.json") + if err := writeManifest(manifestPath, manifest); err != nil { + return nil, fmt.Errorf("write manifest: %w", err) + } + + var jsonls []string + var ok, failed, pending int + for _, r := range manifest.Repos { + switch r.Status { + case "ok": + jsonls = append(jsonls, r.JSONL) + ok++ + case "pending": + pending++ + default: + failed++ + } + } + log.Printf("scan: %d ok, %d failed, %d not started; manifest at %s", ok, failed, pending, manifestPath) + + res := &Result{ + OutputDir: cfg.Output, + Manifest: manifest, + JSONLs: jsonls, + } + // Surface cancellation as a non-nil error so the CLI exits non-zero + // — but return the partial result anyway so the caller can still + // inspect what completed before the interrupt. + if err := ctx.Err(); err != nil { + return res, err + } + return res, nil +} + +func loadMatcher(cfg Config) (*Matcher, error) { + // Per-root default: look for .gitcortex-ignore at the first root when + // no explicit path was given. Single-file load is enough for the MVP; + // if users need per-root rule sets they can concat them into one file. 
+ path := cfg.IgnoreFile + if path == "" && len(cfg.Roots) > 0 { + candidate := filepath.Join(cfg.Roots[0], ".gitcortex-ignore") + if _, err := os.Stat(candidate); err == nil { + path = candidate + } + } + if path == "" { + return NewMatcher(nil), nil + } + return LoadMatcher(path) +} + +func runOne(ctx context.Context, cfg Config, repo Repo) ManifestRepo { + jsonlPath := filepath.Join(cfg.Output, repo.Slug+".jsonl") + statePath := filepath.Join(cfg.Output, repo.Slug+".state") + entry := ManifestRepo{ + Slug: repo.Slug, + Path: repo.AbsPath, + JSONL: jsonlPath, + StateFile: statePath, + } + + // Skip cleanly if the user already cancelled before this worker + // picked up the job — avoids a noisy "extracting..." log followed + // by an immediate ctx.Err() from extract. + if err := ctx.Err(); err != nil { + entry.Status = "skipped" + entry.Error = err.Error() + return entry + } + + branch := cfg.Extract.Branch + if branch == "" { + branch = git.DetectDefaultBranch(repo.AbsPath) + } + if branch == "" { + entry.Status = "failed" + entry.Error = "could not detect default branch" + return entry + } + entry.Branch = branch + + // Copy the extract config so each worker gets its own Repo/Output/State + // without racing on shared fields. Preserves ignore patterns, batch + // size, mailmap, etc. from the template. + sub := cfg.Extract + sub.Repo = repo.AbsPath + sub.Branch = branch + sub.Output = jsonlPath + sub.StateFile = statePath + if sub.CommandTimeout == 0 { + sub.CommandTimeout = extract.DefaultCommandTimeout + } + // Always force StartOffset back to -1 (= "load from state file") + // regardless of the template's value. Programmatic callers passing + // a zero-value Config would otherwise feed flagOffset=0 to + // extract.LoadState, which would IGNORE the per-repo state file and + // re-extract from scratch. StartSHA gets cleared for the same + // reason: a single CLI-level SHA cannot apply to N different repos. 
+ sub.StartSHA = "" + sub.StartOffset = -1 + + log.Printf("scan: [%s] extracting from %s (branch %s)", repo.Slug, repo.AbsPath, branch) + + start := time.Now() + if err := extract.Run(ctx, sub); err != nil { + entry.Status = "failed" + entry.Error = fmt.Errorf("extract %s: %w", repo.Slug, err).Error() + entry.DurationMS = time.Since(start).Milliseconds() + log.Printf("scan: [%s] failed: %v", repo.Slug, err) + return entry + } + entry.Status = "ok" + entry.DurationMS = time.Since(start).Milliseconds() + return entry +} + +func writeManifest(path string, m Manifest) error { + data, err := json.MarshalIndent(m, "", " ") + if err != nil { + return err + } + return os.WriteFile(path, data, 0o644) +} diff --git a/internal/scan/scan_test.go b/internal/scan/scan_test.go new file mode 100644 index 0000000..db24410 --- /dev/null +++ b/internal/scan/scan_test.go @@ -0,0 +1,138 @@ +package scan + +import ( + "context" + "encoding/json" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + + "github.com/lex0c/gitcortex/internal/extract" + "github.com/lex0c/gitcortex/internal/stats" +) + +func TestRun_EndToEnd(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not installed") + } + + root := t.TempDir() + makeRealRepo(t, filepath.Join(root, "alpha"), map[string]string{ + "main.go": "package main\nfunc main() {}\n", + "README.md": "# alpha\n", + }) + makeRealRepo(t, filepath.Join(root, "beta"), map[string]string{ + "app.py": "print('hi')\n", + }) + // One node_modules-style decoy that should be ignored. 
+ makeRealRepo(t, filepath.Join(root, "node_modules", "garbage"), map[string]string{ + "x.js": "x\n", + }) + + output := filepath.Join(t.TempDir(), "out") + cfg := Config{ + Roots: []string{root}, + Output: output, + Parallel: 2, + Extract: extract.Config{ + BatchSize: 100, + CommandTimeout: extract.DefaultCommandTimeout, + }, + } + cfg.IgnoreFile = writeIgnoreFile(t, root, "node_modules\n") + + res, err := Run(context.Background(), cfg) + if err != nil { + t.Fatalf("Run: %v", err) + } + + if len(res.JSONLs) != 2 { + t.Fatalf("expected 2 JSONLs, got %d (%v)", len(res.JSONLs), res.JSONLs) + } + + // Manifest sanity + manifestPath := filepath.Join(output, "manifest.json") + mb, err := os.ReadFile(manifestPath) + if err != nil { + t.Fatalf("read manifest: %v", err) + } + var m Manifest + if err := json.Unmarshal(mb, &m); err != nil { + t.Fatalf("parse manifest: %v", err) + } + if len(m.Repos) != 2 { + t.Fatalf("manifest should have 2 repos, got %d", len(m.Repos)) + } + for _, r := range m.Repos { + if r.Status != "ok" { + t.Errorf("repo %s status=%s err=%s", r.Slug, r.Status, r.Error) + } + } + + // Consolidated load + breakdown + ds, err := stats.LoadMultiJSONL(res.JSONLs) + if err != nil { + t.Fatalf("LoadMultiJSONL: %v", err) + } + breakdown := stats.RepoBreakdown(ds, "") + if len(breakdown) != 2 { + t.Fatalf("expected breakdown across 2 repos, got %d: %+v", len(breakdown), breakdown) + } + got := map[string]int{} + for _, b := range breakdown { + got[b.Repo] = b.Commits + } + for _, name := range []string{"alpha", "beta"} { + if got[name] == 0 { + t.Errorf("repo %s missing or has 0 commits in breakdown: %v", name, got) + } + } +} + +// makeRealRepo initializes a git repo with the given files and a single +// commit using a deterministic identity so the assertions don't depend on +// the developer's git config. 
+func makeRealRepo(t *testing.T, dir string, files map[string]string) { + t.Helper() + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + for name, contents := range files { + full := filepath.Join(dir, name) + if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(full, []byte(contents), 0o644); err != nil { + t.Fatal(err) + } + } + + runGit := func(args ...string) { + cmd := exec.Command("git", args...) + cmd.Dir = dir + cmd.Env = append(os.Environ(), + "GIT_AUTHOR_NAME=Test", + "GIT_AUTHOR_EMAIL=test@example.com", + "GIT_COMMITTER_NAME=Test", + "GIT_COMMITTER_EMAIL=test@example.com", + ) + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %s: %v\n%s", strings.Join(args, " "), err, out) + } + } + runGit("init", "-q", "-b", "main") + runGit("add", ".") + runGit("commit", "-q", "-m", "initial") +} + +func writeIgnoreFile(t *testing.T, root, contents string) string { + t.Helper() + path := filepath.Join(root, ".gitcortex-ignore") + if err := os.WriteFile(path, []byte(contents), 0o644); err != nil { + t.Fatal(err) + } + return path +} diff --git a/internal/stats/reader.go b/internal/stats/reader.go index 87fc4ef..63783e7 100644 --- a/internal/stats/reader.go +++ b/internal/stats/reader.go @@ -22,6 +22,11 @@ type commitEntry struct { del int64 files int message string + // repo carries the LoadMultiJSONL pathPrefix (without the trailing + // colon) so RepoBreakdown can attribute commits back to their source + // repository on multi-repo scans. Empty for single-file loads, which + // keeps single-repo callers' behavior unchanged. 
+ repo string } type fileEntry struct { @@ -242,6 +247,7 @@ func streamLoadInto(ds *Dataset, r io.Reader, opt LoadOptions, pathPrefix string del: c.Deletions, files: c.FilesChanged, message: c.Message, + repo: strings.TrimSuffix(pathPrefix, ":"), } // Contributors diff --git a/internal/stats/repo_breakdown.go b/internal/stats/repo_breakdown.go new file mode 100644 index 0000000..0de8091 --- /dev/null +++ b/internal/stats/repo_breakdown.go @@ -0,0 +1,169 @@ +package stats + +import ( + "sort" + "strings" + "time" +) + +// RepoStat aggregates commit-level activity for one repository in a +// multi-repo Dataset. Populated only when LoadMultiJSONL ran with more +// than one input file (single-repo loads have no path prefix to group by, +// so the breakdown would be a noop). +type RepoStat struct { + Repo string + Commits int + Additions int64 + Deletions int64 + Churn int64 + Files int + ActiveDays int + UniqueDevs int + FirstCommitDate string + LastCommitDate string + // PctOfTotalCommits / PctOfTotalChurn are computed against the + // dataset totals so the report can show "this repo was 38% of + // your commits" without the template needing to do the math. + PctOfTotalCommits float64 + PctOfTotalChurn float64 +} + +// RepoBreakdown groups commits by their repo tag (set during ingest from +// the LoadMultiJSONL pathPrefix) and returns one row per repo, sorted by +// commit count descending. When emailFilter is non-empty, only commits +// authored by that email are counted — this is the "show my work across +// repos" lens used by the scan report. +// +// On single-repo datasets (no prefix tag, repo == ""), the function +// returns a single row labeled "(repo)" so callers can still render +// without a special case. +// +// Caveat: the underlying ds.commits map is keyed by SHA. If two repos +// share a commit SHA (cherry-pick, mirror, fork) the second one ingested +// overwrites the first, and the breakdown will report only the +// surviving repo. 
In practice this only matters for forked / mirrored +// repos under a single scan root — for those, scan and aggregate +// separately if exact attribution matters. +func RepoBreakdown(ds *Dataset, emailFilter string) []RepoStat { + type acc struct { + commits int + add, del int64 + days map[string]struct{} + devs map[string]struct{} + first, last time.Time + } + + repos := make(map[string]*acc) + emailLower := strings.ToLower(strings.TrimSpace(emailFilter)) + + for _, c := range ds.commits { + if emailLower != "" && strings.ToLower(strings.TrimSpace(c.email)) != emailLower { + continue + } + key := c.repo + if key == "" { + key = "(repo)" + } + a, ok := repos[key] + if !ok { + a = &acc{ + days: make(map[string]struct{}), + devs: make(map[string]struct{}), + } + repos[key] = a + } + a.commits++ + a.add += c.add + a.del += c.del + if c.email != "" { + a.devs[strings.ToLower(c.email)] = struct{}{} + } + if !c.date.IsZero() { + a.days[c.date.UTC().Format("2006-01-02")] = struct{}{} + if a.first.IsZero() || c.date.Before(a.first) { + a.first = c.date + } + if c.date.After(a.last) { + a.last = c.date + } + } + } + + // File counts come from ds.files. Path prefix is `:` so + // the split key matches the commit-side `c.repo`. Files without a + // prefix (single-repo loads) bucket under the same "(repo)" label. + // + // When emailFilter is set the count must reflect files THIS dev + // touched, not the repo's total — the unfiltered count would + // massively over-state per-dev scope (e.g. "I worked on 12k files + // in monorepo X" when really the dev touched 30). devCommits + // tracks per-file dev appearance and is the right denominator. 
+ fileCounts := make(map[string]int) + for path, fe := range ds.files { + if emailLower != "" { + matched := false + for devEmail := range fe.devCommits { + if strings.ToLower(strings.TrimSpace(devEmail)) == emailLower { + matched = true + break + } + } + if !matched { + continue + } + } + key := "" + if i := strings.Index(path, ":"); i >= 0 { + key = path[:i] + } + if key == "" { + key = "(repo)" + } + fileCounts[key]++ + } + + // Totals computed AFTER the email filter: percentages are relative + // to the filtered universe so a dev profile shows "60% of MY commits + // happened in repo X", not "0.4% of the org's commits". + var totalCommits int + var totalChurn int64 + for _, a := range repos { + totalCommits += a.commits + totalChurn += a.add + a.del + } + + out := make([]RepoStat, 0, len(repos)) + for repo, a := range repos { + stat := RepoStat{ + Repo: repo, + Commits: a.commits, + Additions: a.add, + Deletions: a.del, + Churn: a.add + a.del, + Files: fileCounts[repo], + ActiveDays: len(a.days), + UniqueDevs: len(a.devs), + } + if !a.first.IsZero() { + stat.FirstCommitDate = a.first.UTC().Format("2006-01-02") + } + if !a.last.IsZero() { + stat.LastCommitDate = a.last.UTC().Format("2006-01-02") + } + if totalCommits > 0 { + stat.PctOfTotalCommits = float64(stat.Commits) / float64(totalCommits) * 100 + } + if totalChurn > 0 { + stat.PctOfTotalChurn = float64(stat.Churn) / float64(totalChurn) * 100 + } + out = append(out, stat) + } + + sort.Slice(out, func(i, j int) bool { + if out[i].Commits != out[j].Commits { + return out[i].Commits > out[j].Commits + } + return out[i].Repo < out[j].Repo + }) + return out +} diff --git a/internal/stats/repo_breakdown_test.go b/internal/stats/repo_breakdown_test.go new file mode 100644 index 0000000..952ae00 --- /dev/null +++ b/internal/stats/repo_breakdown_test.go @@ -0,0 +1,127 @@ +package stats + +import ( + "testing" + "time" +) + +func TestRepoBreakdown_GroupsByPrefix(t *testing.T) { + ds := &Dataset{ + commits: 
map[string]*commitEntry{ + "a1": {email: "me@x.com", repo: "alpha", date: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), add: 10, del: 0}, + "a2": {email: "me@x.com", repo: "alpha", date: time.Date(2024, 1, 5, 0, 0, 0, 0, time.UTC), add: 5, del: 2}, + "b1": {email: "you@x.com", repo: "beta", date: time.Date(2024, 2, 1, 0, 0, 0, 0, time.UTC), add: 30, del: 0}, + }, + files: map[string]*fileEntry{ + "alpha:main.go": {}, + "alpha:helper.go": {}, + "beta:app.py": {}, + }, + } + + breakdown := RepoBreakdown(ds, "") + if len(breakdown) != 2 { + t.Fatalf("want 2 repos, got %d", len(breakdown)) + } + // Sort is by commit count desc, so alpha first. + if breakdown[0].Repo != "alpha" || breakdown[0].Commits != 2 { + t.Errorf("alpha: got %+v", breakdown[0]) + } + if breakdown[0].Files != 2 { + t.Errorf("alpha files: got %d, want 2", breakdown[0].Files) + } + if breakdown[1].Repo != "beta" || breakdown[1].Commits != 1 { + t.Errorf("beta: got %+v", breakdown[1]) + } + // Pct totals — alpha 2/3, beta 1/3. + if got := breakdown[0].PctOfTotalCommits; got < 66.6 || got > 66.7 { + t.Errorf("alpha pct commits: got %.2f, want ~66.67", got) + } +} + +func TestRepoBreakdown_EmailFilter(t *testing.T) { + ds := &Dataset{ + commits: map[string]*commitEntry{ + "a1": {email: "me@x.com", repo: "alpha", date: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), add: 10}, + "a2": {email: "other@x.com", repo: "alpha", date: time.Date(2024, 1, 2, 0, 0, 0, 0, time.UTC), add: 5}, + "b1": {email: "me@x.com", repo: "beta", date: time.Date(2024, 2, 1, 0, 0, 0, 0, time.UTC), add: 30}, + }, + // Per-dev file counts: alpha has 2 files but me@ touched only 1; + // beta has 1 file me@ touched. Other dev's exclusive file in + // alpha must NOT be counted toward me@'s repo Files. 
+ files: map[string]*fileEntry{ + "alpha:mine.go": {devCommits: map[string]int{"me@x.com": 1}}, + "alpha:other.go": {devCommits: map[string]int{"other@x.com": 1}}, + "beta:both.py": {devCommits: map[string]int{"me@x.com": 1}}, + }, + } + breakdown := RepoBreakdown(ds, "me@x.com") + if len(breakdown) != 2 { + t.Fatalf("want 2 repos for me@x.com, got %d", len(breakdown)) + } + for _, b := range breakdown { + if b.Commits != 1 { + t.Errorf("repo %s: expected 1 commit (filtered), got %d", b.Repo, b.Commits) + } + if b.Files != 1 { + t.Errorf("repo %s: expected 1 file (filtered), got %d", b.Repo, b.Files) + } + } +} + +// A rename within a repo collapses two file paths onto one canonical +// path during finalizeDataset. The collapsed path keeps its `:` +// prefix, so RepoBreakdown should still attribute the file to the +// correct repo. Regression guard: without proper handling, the +// post-rename canonical key would either lose its prefix (and bucket +// into "(repo)") or duplicate-count. +func TestRepoBreakdown_SurvivesRename(t *testing.T) { + ds := newDataset() + ds.commits["c1"] = &commitEntry{ + email: "me@x.com", repo: "alpha", + date: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), add: 5, + } + ds.commits["c2"] = &commitEntry{ + email: "me@x.com", repo: "alpha", + date: time.Date(2024, 1, 5, 0, 0, 0, 0, time.UTC), add: 10, + } + ds.files["alpha:old.go"] = &fileEntry{ + devCommits: map[string]int{"me@x.com": 1}, + } + ds.files["alpha:new.go"] = &fileEntry{ + devCommits: map[string]int{"me@x.com": 1}, + } + ds.renameEdges = []renameEdge{{ + oldPath: "alpha:old.go", newPath: "alpha:new.go", + commitDate: time.Date(2024, 1, 5, 0, 0, 0, 0, time.UTC), + }} + applyRenames(ds) + + breakdown := RepoBreakdown(ds, "") + if len(breakdown) != 1 || breakdown[0].Repo != "alpha" { + t.Fatalf("want single alpha row, got %+v", breakdown) + } + // After rename collapse: 1 canonical file, both commits attributed. 
+ if breakdown[0].Files != 1 { + t.Errorf("want 1 file (post-rename), got %d", breakdown[0].Files) + } + if breakdown[0].Commits != 2 { + t.Errorf("want 2 commits, got %d", breakdown[0].Commits) + } +} + +func TestRepoBreakdown_SingleRepoFallback(t *testing.T) { + // No prefix tag → all commits bucket under "(repo)". + ds := &Dataset{ + commits: map[string]*commitEntry{ + "x1": {email: "me@x.com", date: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), add: 5}, + }, + files: map[string]*fileEntry{ + "src/foo.go": {}, + }, + } + breakdown := RepoBreakdown(ds, "") + if len(breakdown) != 1 || breakdown[0].Repo != "(repo)" { + t.Fatalf("want single (repo) row, got %+v", breakdown) + } +} From 02100f5c0ac271f32d9d6e015bdf69a332677ede Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 11:58:42 -0300 Subject: [PATCH 02/37] Detect bare repositories during scan discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bare clones (foo.git dirs with HEAD/objects/refs at the root rather than inside .git/) weren't picked up — discovery only accepted working-tree layouts. Since mirrors, fixture clones, and GitHub-style server dirs all use this shape, the omission blocked the most common "point gitcortex scan at a folder of .git clones" validation path. isBareRepo requires all three canonical entries to avoid false positives from stray HEAD/ or refs/ directories. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 42 +++++++++++++++++++++++---------- internal/scan/discovery_test.go | 36 ++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 12 deletions(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 0d7b66f..618fefd 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -77,20 +77,24 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { return filepath.SkipDir } - // Detect .git either as a directory (normal) or a file - // (worktree / submodule). Presence alone is enough — we don't - // try to validate the content; extract will fail loudly if - // the repo is broken, which is a better signal than silently - // skipping. - // Lstat (not Stat) so a symlink named `.git` pointing somewhere - // arbitrary doesn't get treated as a real repo. Real `.git` - // entries are either a directory (normal worktree) or a regular - // file (worktree/submodule pointer). Both are caught here; a - // symlink is rejected, which is the conservative choice — if - // users genuinely point `.git` at a shared dir via symlink they - // can resolve it before scanning. + // Two repo shapes to detect: + // 1. Working tree: path/.git is a dir (normal) or file + // (worktree/submodule pointer). + // 2. Bare repo: path itself contains HEAD + objects/ + // + refs/ — common for clones used as fixtures or + // mirrors (`git clone --bare`, GitHub-style server dirs + // named foo.git). + // Lstat (not Stat) on the .git entry so a symlink named .git + // pointing somewhere arbitrary doesn't get treated as a real + // repo. The bare check uses os.Stat by way of isBareRepo, + // which inspects three required entries — much harder to + // trick than a single dirent check. 
gitEntry := filepath.Join(path, ".git") + isWorkingTree := false if info, statErr := os.Lstat(gitEntry); statErr == nil && info.Mode()&os.ModeSymlink == 0 { + isWorkingTree = true + } + if isWorkingTree || isBareRepo(path) { if seen[path] { return filepath.SkipDir } @@ -155,6 +159,20 @@ func assignSlugs(repos []Repo) { } } +// isBareRepo returns true when path is a bare git repository — i.e. the +// directory itself holds HEAD, objects/, and refs/ rather than wrapping +// them in a .git subdirectory. All three entries are required because +// a stray HEAD file or empty refs/ dir alone is not enough to be a real +// repo and we don't want false positives polluting the manifest. +func isBareRepo(path string) bool { + for _, name := range []string{"HEAD", "objects", "refs"} { + if _, err := os.Stat(filepath.Join(path, name)); err != nil { + return false + } + } + return true +} + // sanitizeSlug strips characters that would break the LoadMultiJSONL prefix // contract (`:`) or filesystem paths. Keeps alphanumerics, dash, // underscore, and dot. diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 4bb7a0c..9957b21 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -164,3 +164,39 @@ func mustMkRepo(t *testing.T, path string) { t.Fatal(err) } } + +func TestDiscover_FindsBareRepo(t *testing.T) { + root := t.TempDir() + bare := filepath.Join(root, "myrepo.git") + for _, name := range []string{"HEAD", "objects", "refs"} { + full := filepath.Join(bare, name) + if name == "HEAD" { + if err := os.MkdirAll(bare, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(full, []byte("ref: refs/heads/main\n"), 0o644); err != nil { + t.Fatal(err) + } + } else { + if err := os.MkdirAll(full, 0o755); err != nil { + t.Fatal(err) + } + } + } + // Decoy: dir with HEAD only — must not be picked up. 
+ decoy := filepath.Join(root, "not-a-repo") + if err := os.MkdirAll(decoy, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(decoy, "HEAD"), []byte("nope\n"), 0o644); err != nil { + t.Fatal(err) + } + + repos, err := Discover([]string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + if len(repos) != 1 || repos[0].Slug != "myrepo.git" { + t.Fatalf("expected single myrepo.git, got %+v", repos) + } +} From 16742dadd5cde382ee7b805d184361a1e77af30e Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:04:14 -0300 Subject: [PATCH 03/37] Strip repo prefix from suspect-detector suggestions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LoadMultiJSONL prefixes file paths with ':' so cross-repo aggregates can attribute files back to their source. The suspect detector's directory-glob suggestions inherited that prefix, which made the warning's remediation command a no-op: extract --ignore and scan --extract-ignore both expect bare, repo-relative globs. Strip the prefix in CollectAllSuggestions before dedup — same shape from different repos (dist/*, node_modules/*) now collapses to one canonical suggestion that works across every repo in the scan. The stripping is conservative: a colon only counts as a prefix delimiter when the preceding segment contains no slash or glob metachars, preserving the rare but valid case of a colon embedded in a real path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/stats/suspect.go | 30 ++++++++++++++++++ internal/stats/suspect_test.go | 57 ++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/internal/stats/suspect.go b/internal/stats/suspect.go index 33a4b5f..af62aaf 100644 --- a/internal/stats/suspect.go +++ b/internal/stats/suspect.go @@ -211,11 +211,23 @@ func ShellQuoteSingle(s string) string { // trimming suggestions to just the displayed subset would leave // unshown suspects untouched and cause the warning to persist after // the suggested fix. +// +// Multi-repo note: LoadMultiJSONL prefixes file paths with `:` +// so every consumer (hotspots, directories, suspect detector) can +// attribute paths back to their source. The suggestion strings +// inherit that prefix from suggestDirGlob, but `extract --ignore` and +// `scan --extract-ignore` both consume bare repo-relative globs — +// neither understands the `:` notation. Strip it here so the +// copy-pasted remediation command actually matches the offending +// paths. Same-shaped globs from different repos collapse via the +// dedup below (e.g. `dist/*` appearing under multiple repos becomes +// one suggestion, which is what the user wants). func CollectAllSuggestions(buckets []SuspectBucket) []string { seen := map[string]struct{}{} var out []string for _, b := range buckets { for _, s := range b.Suggestions { + s = stripRepoPrefix(s) if _, ok := seen[s]; ok { continue } @@ -225,3 +237,21 @@ func CollectAllSuggestions(buckets []SuspectBucket) []string { } return out } + +// stripRepoPrefix removes a leading `:` from a suggestion glob +// when present. Conservative: the prefix is only recognized when the +// segment before the colon has no slash or glob metacharacters, which +// matches the LoadMultiJSONL convention and avoids misinterpreting +// legitimate colons inside paths (rare but possible on some +// filesystems). 
Single-repo datasets have no prefix and pass through +// unchanged. +func stripRepoPrefix(s string) string { + i := strings.Index(s, ":") + if i < 0 { + return s + } + if strings.ContainsAny(s[:i], "/*?[") { + return s + } + return s[i+1:] +} diff --git a/internal/stats/suspect_test.go b/internal/stats/suspect_test.go index a83ea2c..bd2df02 100644 --- a/internal/stats/suspect_test.go +++ b/internal/stats/suspect_test.go @@ -1,6 +1,7 @@ package stats import ( + "strings" "testing" "github.com/lex0c/gitcortex/internal/extract" @@ -353,6 +354,62 @@ func TestCollectAllSuggestionsCoversUnshownBuckets(t *testing.T) { } } +// When paths come from LoadMultiJSONL they carry a `:` prefix. +// Those prefixes leak into directory-glob suggestions and make the +// copy-pasted `extract --ignore` / `scan --extract-ignore` command +// a no-op on real repo paths (which never carry the prefix). +// CollectAllSuggestions must strip the prefix and collapse duplicates +// that only differed in which repo they came from. +func TestCollectAllSuggestionsStripsRepoPrefix(t *testing.T) { + ds := &Dataset{ + files: map[string]*fileEntry{ + // Two repos with the same offending shape — suggestions + // collapse to one canonical "dist/*" glob. + "repoA:js/dist/a.min.js": {additions: 400, deletions: 400}, + "repoB:css/dist/b.min.css": {additions: 400, deletions: 400}, + // A single-repo path stays untouched (no prefix). 
+ "vendor/c.go": {additions: 400, deletions: 400}, + }, + } + buckets, _ := DetectSuspectFiles(ds) + got := CollectAllSuggestions(buckets) + + for _, s := range got { + if strings.Contains(s, ":") { + t.Errorf("suggestion %q still carries a repo prefix — users can't copy-paste this into extract --ignore", s) + } + } + has := func(want string) bool { + for _, s := range got { + if s == want { + return true + } + } + return false + } + for _, want := range []string{"js/dist/*", "css/dist/*", "vendor/*"} { + if !has(want) { + t.Errorf("expected suggestion %q in %v", want, got) + } + } +} + +func TestStripRepoPrefix(t *testing.T) { + cases := []struct{ in, want string }{ + {"repoA:wp-includes/dist/*", "wp-includes/dist/*"}, + {"dist/*", "dist/*"}, // no prefix + {"*.min.js", "*.min.js"}, // no prefix + {"vendor/*", "vendor/*"}, // no prefix (no colon) + {"src/weird:name.go", "src/weird:name.go"}, // colon after slash = not a prefix + {"glob*prefix:foo/bar", "glob*prefix:foo/bar"}, // metachar before colon = not a prefix + } + for _, c := range cases { + if got := stripRepoPrefix(c.in); got != c.want { + t.Errorf("stripRepoPrefix(%q) = %q, want %q", c.in, got, c.want) + } + } +} + func TestSuspectSuggestionsMatchExtractShouldIgnore(t *testing.T) { // Mix root-level and nested occurrences of every pattern class: // directory segments (vendor, dist), suffix (*.min.js), basename From 61f8e65a359bd07be00a33338e36b6f09f02f783 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:13:37 -0300 Subject: [PATCH 04/37] Honor negated ignore rules before pruning a directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unconditional SkipDir on an ignored directory short-circuited the matcher's last-match-wins semantics: patterns like `vendor/` + `!vendor/keep` would prune vendor before vendor/keep could be examined, so intentionally re-included repositories were silently dropped from the scan. 
Matcher now exposes CouldReinclude(dir), which returns true iff any negation rule could fire on a descendant of dir — explicit child references (!vendor/keep) and basename-only negations (!keep) both force descent. Discovery descends instead of pruning whenever that check is true, letting per-path Match() decide. Unrelated exclusions keep pruning effective for the common case. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 11 ++++++- internal/scan/discovery_test.go | 53 +++++++++++++++++++++++++++++++++ internal/scan/ignore.go | 37 +++++++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 618fefd..feae8b2 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -74,7 +74,16 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { } if rel != "." && matcher.Match(rel, true) { - return filepath.SkipDir + // Don't SkipDir blindly — if any negation rule could + // re-include a descendant (e.g. `vendor/` + `!vendor/keep`), + // pruning here would drop the re-inclusion before its + // target is visited. Descend and let Match() decide + // per-path. For ordinary `node_modules`-style excludes + // with no negations in play, CouldReinclude returns + // false and pruning remains. + if !matcher.CouldReinclude(rel) { + return filepath.SkipDir + } } // Two repo shapes to detect: diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 9957b21..e5917ef 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -50,6 +50,59 @@ func TestDiscover_RespectsIgnore(t *testing.T) { } } +// Regression: `vendor/` + `!vendor/keep` must descend into vendor/ +// so the negation has a chance to re-include vendor/keep. Before the +// fix, the walker SkipDir'd vendor unconditionally and the re-included +// repo was silently dropped. 
+func TestDiscover_HonorsNegatedDescendant(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "app")) + mustMkRepo(t, filepath.Join(root, "vendor", "garbage")) + mustMkRepo(t, filepath.Join(root, "vendor", "keep")) + + matcher := NewMatcher([]string{"vendor/", "!vendor/keep"}) + repos, err := Discover([]string{root}, matcher, 0) + if err != nil { + t.Fatal(err) + } + got := map[string]bool{} + for _, r := range repos { + got[r.RelPath] = true + } + if !got["app"] { + t.Errorf("app should be included: %+v", repos) + } + if !got["vendor/keep"] { + t.Errorf("vendor/keep should be re-included by negation rule: %+v", repos) + } + if got["vendor/garbage"] { + t.Errorf("vendor/garbage should remain ignored: %+v", repos) + } +} + +func TestMatcher_CouldReinclude(t *testing.T) { + cases := []struct { + name string + patterns []string + dir string + want bool + }{ + {"no negation", []string{"vendor/"}, "vendor", false}, + {"explicit descendant", []string{"vendor/", "!vendor/keep"}, "vendor", true}, + {"unrelated negation", []string{"vendor/", "!src/main"}, "vendor", false}, + {"basename negation could fire anywhere", []string{"vendor/", "!keep"}, "vendor", true}, + {"deep-match negation", []string{"build/", "!**/src"}, "build", true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + m := NewMatcher(c.patterns) + if got := m.CouldReinclude(c.dir); got != c.want { + t.Errorf("CouldReinclude(%q) with %v = %v, want %v", c.dir, c.patterns, got, c.want) + } + }) + } +} + func TestDiscover_DoesNotDescendIntoRepo(t *testing.T) { root := t.TempDir() parent := filepath.Join(root, "parent") diff --git a/internal/scan/ignore.go b/internal/scan/ignore.go index c378b05..30ab306 100644 --- a/internal/scan/ignore.go +++ b/internal/scan/ignore.go @@ -105,6 +105,43 @@ func (m *Matcher) Match(relPath string, isDir bool) bool { return matched } +// CouldReinclude reports whether any negation rule targets a descendant +// of dir — i.e. 
walking into the dir could still yield a re-included +// path. Callers use this to decide whether to prune an ignored +// directory via filepath.SkipDir or descend into it and evaluate +// children individually. +// +// Without this check the walker short-circuits the matcher's last- +// match-wins semantics: a `vendor/` + `!vendor/keep` pair would skip +// the vendor subtree entirely before vendor/keep could be examined, +// silently dropping the re-included path. +// +// The check is conservative — it returns true when: +// - a negation's pattern begins with `dir/` (explicit descendant +// reference, the documented common case), OR +// - a negation's pattern has no path separator (basename rules like +// `!keep` that could fire anywhere, including inside dir). +// +// Negations that target a sibling or ancestor path correctly don't +// trigger descent, so pruning remains effective for unrelated ignored +// trees like `node_modules/`. +func (m *Matcher) CouldReinclude(dir string) bool { + dir = filepath.ToSlash(dir) + for _, r := range m.rules { + if !r.Negate { + continue + } + pat := strings.TrimPrefix(r.Pattern, "**/") + if !strings.Contains(pat, "/") { + return true + } + if strings.HasPrefix(pat, dir+"/") { + return true + } + } + return false +} + func matchRule(r IgnoreRule, relPath string) bool { pat := r.Pattern From 5950a26b7bbe0a9d063939b3286e8ad903fc451a Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:17:43 -0300 Subject: [PATCH 05/37] Keep per-repo commits when SHAs overlap across repositories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ds.commits is keyed by SHA; when a scan ingests repos that share a commit (fork, mirror, cherry-pick) the later ingest overwrote the earlier one. 
RepoBreakdown iterated ds.commits directly, so the consolidated report silently under-counted commits and churn for one side of every collision — materially wrong percentages in exactly the multi-repo scenarios scan was built for. Track commits in a parallel map[repo][]*commitEntry populated at ingest. File-side lookups keep using ds.commits (one record per SHA is enough there); RepoBreakdown iterates the per-repo slice, which preserves every commit regardless of collisions. Regression test uses LoadMultiJSONL with two JSONLs that share a SHA — without the fix, one of the two repos disappears from the breakdown entirely. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/stats/reader.go | 24 ++++++++++- internal/stats/repo_breakdown.go | 59 ++++++++++++++++++--------- internal/stats/repo_breakdown_test.go | 50 +++++++++++++++++++++++ 3 files changed, 112 insertions(+), 21 deletions(-) diff --git a/internal/stats/reader.go b/internal/stats/reader.go index 63783e7..4162249 100644 --- a/internal/stats/reader.go +++ b/internal/stats/reader.go @@ -87,6 +87,17 @@ type Dataset struct { files map[string]*fileEntry workGrid [7][24]int + // commitsByRepo preserves commit records per repository so the + // per-repo breakdown stays correct when two repos share a SHA + // (forks, mirrors, cherry-picks between sibling projects). The + // commits map above is keyed by SHA and drops the losing side of + // a collision — acceptable for file-side lookups where a single + // record per SHA is enough — but RepoBreakdown would then + // silently under-count the overwritten repo's commits and churn. + // Populated at ingest in lockstep with commits; a single-repo + // load (no pathPrefix) keeps all entries under the "(repo)" key. 
+ commitsByRepo map[string][]*commitEntry + // Coupling (pre-aggregated during load) couplingPairs map[filePair]int couplingFileChanges map[string]int @@ -163,6 +174,7 @@ func newDataset() *Dataset { contribFiles: make(map[string]map[string]struct{}), contribFirst: make(map[string]time.Time), contribLast: make(map[string]time.Time), + commitsByRepo: make(map[string][]*commitEntry), } } @@ -240,7 +252,7 @@ func streamLoadInto(ds *Dataset, r io.Reader, opt LoadOptions, pathPrefix string ds.TotalDeletions += c.Deletions ds.TotalFilesChanged += int64(c.FilesChanged) - ds.commits[c.SHA] = &commitEntry{ + entry := &commitEntry{ email: c.AuthorEmail, date: t, add: c.Additions, @@ -249,6 +261,16 @@ func streamLoadInto(ds *Dataset, r io.Reader, opt LoadOptions, pathPrefix string message: c.Message, repo: strings.TrimSuffix(pathPrefix, ":"), } + ds.commits[c.SHA] = entry + // Record separately per-repo so RepoBreakdown survives + // SHA collisions across repositories (forks/mirrors/ + // cherry-picks). Key defaults to "(repo)" in single-repo + // loads so the fallback row in RepoBreakdown still works. + repoKey := entry.repo + if repoKey == "" { + repoKey = "(repo)" + } + ds.commitsByRepo[repoKey] = append(ds.commitsByRepo[repoKey], entry) // Contributors cs, ok := ds.contributors[c.AuthorEmail] diff --git a/internal/stats/repo_breakdown.go b/internal/stats/repo_breakdown.go index 0de8091..57822e8 100644 --- a/internal/stats/repo_breakdown.go +++ b/internal/stats/repo_breakdown.go @@ -28,22 +28,26 @@ type RepoStat struct { PctOfTotalChurn float64 } -// RepoBreakdown groups commits by their repo tag (set during ingest from -// the LoadMultiJSONL pathPrefix) and returns one row per repo, sorted by -// commit count descending. When emailFilter is non-empty, only commits -// authored by that email are counted — this is the "show my work across -// repos" lens used by the scan report. 
+// RepoBreakdown groups commits by repo and returns one row per repo, +// sorted by commit count descending. When emailFilter is non-empty, +// only commits authored by that email are counted — this is the +// "show my work across repos" lens used by the scan report. // -// On single-repo datasets (no prefix tag, repo == ""), the function -// returns a single row labeled "(repo)" so callers can still render -// without a special case. +// Source of truth is ds.commitsByRepo, a per-repository slice +// populated at ingest. The SHA-keyed ds.commits map cannot be used +// directly because two repos that share a commit SHA (fork, mirror, +// cherry-pick) would lose the earlier ingest to the later one, +// under-counting one repo's totals. Iterating commitsByRepo sidesteps +// that collision. // -// Caveat: the underlying ds.commits map is keyed by SHA. If two repos -// share a commit SHA (cherry-pick, mirror, fork) the second one ingested -// overwrites the first, and the breakdown will report only the -// surviving repo. In practice this only matters for forked / mirrored -// repos under a single scan root — for those, scan and aggregate -// separately if exact attribution matters. +// Fallback: test datasets that hand-build ds.commits without +// populating commitsByRepo still work — we detect the empty slice +// and bucket ds.commits by repo as before. Production ingest always +// populates commitsByRepo. +// +// On single-repo datasets (no prefix tag), the function returns a +// single row labeled "(repo)" so callers can render without a special +// case. 
func RepoBreakdown(ds *Dataset, emailFilter string) []RepoStat { type acc struct { commits int @@ -56,13 +60,9 @@ func RepoBreakdown(ds *Dataset, emailFilter string) []RepoStat { repos := make(map[string]*acc) emailLower := strings.ToLower(strings.TrimSpace(emailFilter)) - for _, c := range ds.commits { + visit := func(key string, c *commitEntry) { if emailLower != "" && strings.ToLower(strings.TrimSpace(c.email)) != emailLower { - continue - } - key := c.repo - if key == "" { - key = "(repo)" + return } a, ok := repos[key] if !ok { @@ -89,6 +89,25 @@ func RepoBreakdown(ds *Dataset, emailFilter string) []RepoStat { } } + if len(ds.commitsByRepo) > 0 { + for repo, entries := range ds.commitsByRepo { + for _, c := range entries { + visit(repo, c) + } + } + } else { + // Fallback for hand-built test datasets that populate + // ds.commits directly. Production ingest always goes through + // streamLoadInto and populates commitsByRepo. + for _, c := range ds.commits { + key := c.repo + if key == "" { + key = "(repo)" + } + visit(key, c) + } + } + // File counts come from ds.files. Path prefix is `:` so // the split key matches the commit-side `c.repo`. Files without a // prefix (single-repo loads) bucket under the same "(repo)" label. diff --git a/internal/stats/repo_breakdown_test.go b/internal/stats/repo_breakdown_test.go index 952ae00..404a563 100644 --- a/internal/stats/repo_breakdown_test.go +++ b/internal/stats/repo_breakdown_test.go @@ -1,6 +1,8 @@ package stats import ( + "os" + "path/filepath" "testing" "time" ) @@ -110,6 +112,54 @@ func TestRepoBreakdown_SurvivesRename(t *testing.T) { } } +// Regression: when two repos share a commit SHA (fork / mirror / +// cherry-pick) the ingest map ds.commits drops one side. Without a +// separate per-repo record, RepoBreakdown would silently under-count +// the repo ingested first and the consolidated report would show +// wrong percentages. Must preserve both. 
+func TestRepoBreakdown_SurvivesCrossRepoSHACollision(t *testing.T) { + dir := t.TempDir() + // Two repos with IDENTICAL commit rows — different churn per row + // so we can verify both contribute to the breakdown. In real + // life this maps to a cherry-picked or mirrored commit where the + // SHA matches but the files and stats land in different repos. + commitRow := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","author_date":"2024-01-01T00:00:00Z","author_email":"me@x.com","author_name":"Me","additions":10,"deletions":0,"files_changed":1}` + alphaFile := `{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"a.go","additions":10,"deletions":0}` + betaFile := `{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"b.go","additions":10,"deletions":0}` + + alpha := filepath.Join(dir, "alpha.jsonl") + beta := filepath.Join(dir, "beta.jsonl") + if err := os.WriteFile(alpha, []byte(commitRow+"\n"+alphaFile+"\n"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(beta, []byte(commitRow+"\n"+betaFile+"\n"), 0o644); err != nil { + t.Fatal(err) + } + + ds, err := LoadMultiJSONL([]string{alpha, beta}) + if err != nil { + t.Fatalf("LoadMultiJSONL: %v", err) + } + + breakdown := RepoBreakdown(ds, "") + if len(breakdown) != 2 { + t.Fatalf("want breakdown across both alpha AND beta despite SHA collision, got %d rows: %+v", len(breakdown), breakdown) + } + got := map[string]int{} + for _, b := range breakdown { + got[b.Repo] = b.Commits + } + if got["alpha"] != 1 || got["beta"] != 1 { + t.Errorf("each repo should keep its commit; got alpha=%d beta=%d", got["alpha"], got["beta"]) + } + // Sanity on percentage — each repo is 50% of the 2 collided commits. 
+ for _, b := range breakdown { + if b.PctOfTotalCommits != 50 { + t.Errorf("repo %s: want 50%% commits, got %.1f", b.Repo, b.PctOfTotalCommits) + } + } +} + func TestRepoBreakdown_SingleRepoFallback(t *testing.T) { // No prefix tag → all commits bucket under "(repo)". ds := &Dataset{ From 9a6f5f68148e5ded305831e5ab57d870cc4b9568 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:29:32 -0300 Subject: [PATCH 06/37] Normalize repo prefix before suspect-pattern matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DetectSuspectFiles fed ds.files keys straight into the pattern predicates, but multi-repo loads prefix every path with :. Root-level vendor, package-lock.json, go.sum, and friends missed detection — `strings.HasPrefix(p, "vendor/")` and `p == "package-lock.json"` both fail when the string actually begins with `alpha:vendor/x.go` or equals `alpha:package-lock.json`. Nested occurrences like `repo:pkg/vendor/x.go` still matched via the `/vendor/` substring check, so the bug looked partial in the wild: heavy repos with nested generated content triggered the warning, while a Go project with vendor/ at the root under the same scan went silently unflagged and users kept chasing bus-factor ghosts from vendored code. Strip the `:` prefix once per file and feed the normalized path to both Match and Suggest. Nested paths are unaffected; single- repo loads (no prefix) pass through unchanged. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/stats/suspect.go | 15 +++++++++-- internal/stats/suspect_test.go | 47 ++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/internal/stats/suspect.go b/internal/stats/suspect.go index af62aaf..c12f6f4 100644 --- a/internal/stats/suspect.go +++ b/internal/stats/suspect.go @@ -142,11 +142,22 @@ func DetectSuspectFiles(ds *Dataset) ([]SuspectBucket, bool) { for path, fe := range ds.files { fileChurn := fe.additions + fe.deletions totalChurn += fileChurn + // Multi-repo loads prefix every path with `:` so files + // from different repos can coexist in one Dataset. The + // pattern predicates assume bare repo-relative paths — they + // look for root-level `vendor/`, full-path `package-lock.json`, + // and similar shapes that fail when the string actually + // begins with `alpha:vendor/x.go` or equals + // `alpha:package-lock.json`. Strip the prefix once and feed + // the normalized form to both Match and Suggest; nested + // occurrences (e.g. `repo:pkg/vendor/x.go`) still work via + // the normal substring checks on the stripped path. + matchPath := stripRepoPrefix(path) // A file might match multiple patterns (e.g. vendor/*.pb.go). // Attribute it to the first match so totals don't double-count; // the first-match wins keeps output predictable. 
for _, pat := range defaultSuspectPatterns { - if !pat.Match(path) { + if !pat.Match(matchPath) { continue } b, ok := buckets[pat.Glob] @@ -159,7 +170,7 @@ func DetectSuspectFiles(ds *Dataset) ([]SuspectBucket, bool) { b.Churn += fileChurn suspectChurn += fileChurn if pat.Suggest != nil { - suggSets[pat.Glob][pat.Suggest(path)] = struct{}{} + suggSets[pat.Glob][pat.Suggest(matchPath)] = struct{}{} } break } diff --git a/internal/stats/suspect_test.go b/internal/stats/suspect_test.go index bd2df02..a9dfa7c 100644 --- a/internal/stats/suspect_test.go +++ b/internal/stats/suspect_test.go @@ -410,6 +410,53 @@ func TestStripRepoPrefix(t *testing.T) { } } +// Regression: multi-repo loads prefix every file path with `:`, +// and the pattern predicates (hasPathSegment, basenameEquals) compare +// against the raw string. Root-level occurrences like +// `alpha:vendor/x.go` and `alpha:package-lock.json` slipped past +// detection because `strings.HasPrefix(p, "vendor/")` and +// `p == "package-lock.json"` both fail when the `:` prefix is +// present. Symptom in the wild: stats on a scan consumed multiple +// --input files, suspect warning suppressed, users kept seeing +// inflated churn/bus-factor from generated content with no hint to +// fix it. +func TestDetectSuspectFiles_RootLevelUnderRepoPrefix(t *testing.T) { + ds := &Dataset{ + files: map[string]*fileEntry{ + // Root-level occurrences — the cases that previously missed. + "alpha:vendor/x.go": {additions: 400, deletions: 400}, + "beta:package-lock.json": {additions: 400, deletions: 400}, + "beta:go.sum": {additions: 400, deletions: 400}, + // Nested occurrence — worked before the fix and must still work. 
+ "alpha:pkg/vendor/deep.go": {additions: 100, deletions: 100}, + }, + } + buckets, worth := DetectSuspectFiles(ds) + if !worth { + t.Fatal("expected warning-worthy dataset; prefix-stripped matching should fire on root-level vendor/lockfiles") + } + + got := map[string]bool{} + for _, b := range buckets { + got[b.Pattern.Glob] = true + } + for _, want := range []string{"vendor/*", "package-lock.json", "go.sum"} { + if !got[want] { + t.Errorf("pattern %q missing from detected buckets %v — root-level %q under repo prefix went undetected", want, got, want) + } + } + + // Suggestions should come out clean (no `:`) so the + // copy-pasted --ignore command actually matches these paths + // next run. + suggestions := CollectAllSuggestions(buckets) + for _, s := range suggestions { + if len(s) > 0 && s[0] == '\'' { + t.Errorf("suggestion %q should not carry shell quotes here", s) + } + } +} + func TestSuspectSuggestionsMatchExtractShouldIgnore(t *testing.T) { // Mix root-level and nested occurrences of every pattern class: // directory segments (vendor, dist), suffix (*.min.js), basename From 98efb109897781c6e475a1ebeb52a5d611679e78 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:33:03 -0300 Subject: [PATCH 07/37] Use numeric value for repo breakdown bar widths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Per-Repository Breakdown section fed pctFloat's output (a string) into printf "%.0f". html/template's CSS sanitizer saw the resulting %!f(string=...) as unsafe and replaced every bar width with ZgotmplZ — the commit-share visualization rendered at zero width in every multi-repo report. PctOfTotalCommits is already a float64 on RepoStat, so printf it directly. Drop the $maxRepoCommits variable that was only guarding a zero-churn case already covered by PctOfTotalCommits returning 0. Regression test asserts no ZgotmplZ or %!f(string= markers appear in a multi-repo render. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/report_test.go | 44 ++++++++++++++++++++++++++++++++++ internal/report/template.go | 5 ++-- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/internal/report/report_test.go b/internal/report/report_test.go index 2b4c826..595b0f6 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -77,6 +77,50 @@ func TestGenerate_SmokeRender(t *testing.T) { } } +// Regression: the Per-Repository Breakdown section emits a CSS width +// for each repo's commit-share bar. The width has to be a numeric +// literal — if the template feeds a string through printf "%.0f" +// (e.g. via the string-returning pctFloat helper) html/template's +// CSS sanitizer replaces it with `ZgotmplZ` and the bars render at +// zero width. Exercise the multi-repo path end-to-end and assert +// the rendered widths are clean. +func TestGenerate_PerRepoBreakdownWidthsAreNumeric(t *testing.T) { + dir := t.TempDir() + alpha := filepath.Join(dir, "alpha.jsonl") + beta := filepath.Join(dir, "beta.jsonl") + // Distinct SHAs and content so both repos appear in the breakdown. 
+ alphaRow := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","author_email":"me@x.com","author_name":"Me","author_date":"2024-01-01T00:00:00Z","additions":10,"deletions":0,"files_changed":1} +{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"a.go","additions":10,"deletions":0} +` + betaRow := `{"type":"commit","sha":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb","author_email":"me@x.com","author_name":"Me","author_date":"2024-02-01T00:00:00Z","additions":20,"deletions":5,"files_changed":1} +{"type":"commit_file","commit":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb","path_current":"b.go","additions":20,"deletions":5} +` + if err := os.WriteFile(alpha, []byte(alphaRow), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(beta, []byte(betaRow), 0o644); err != nil { + t.Fatal(err) + } + ds, err := stats.LoadMultiJSONL([]string{alpha, beta}) + if err != nil { + t.Fatalf("LoadMultiJSONL: %v", err) + } + var buf bytes.Buffer + if err := Generate(&buf, ds, "scan-fixture", 10, stats.StatsFlags{CouplingMinChanges: 1, NetworkMinFiles: 1}); err != nil { + t.Fatalf("Generate: %v", err) + } + out := buf.String() + if !strings.Contains(out, "Per-Repository Breakdown") { + t.Fatal("breakdown section missing from multi-repo report") + } + if strings.Contains(out, "ZgotmplZ") { + t.Error("report contains ZgotmplZ — template fed a non-numeric value to a CSS context (likely printf'd a string)") + } + if strings.Contains(out, "%!f(string=") { + t.Error("report contains %!f(string=...) 
— printf was given a string where a float was expected") + } +} + func TestGenerate_EmptyDataset(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "empty.jsonl") diff --git a/internal/report/template.go b/internal/report/template.go index 5abc4e1..9e19846 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -147,8 +147,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{if gt (len .Repos) 1}}

Per-Repository Breakdown {{thousands (len .Repos)}} repositories

-

Cross-repo aggregation from scan. Use this view to see how work is distributed: a single repo dominating commits or churn often means the consolidated activity story is really about that one project; an even spread shows broad multi-repo engagement. Bar widths are normalized to the largest commit-count repo.

-{{$maxRepoCommits := 0}}{{range .Repos}}{{if gt .Commits $maxRepoCommits}}{{$maxRepoCommits = .Commits}}{{end}}{{end}} +

Cross-repo aggregation from scan. Use this view to see how work is distributed: a single repo dominating commits or churn often means the consolidated activity story is really about that one project; an even spread shows broad multi-repo engagement. Bar widths are the share of total commits across all scanned repos.

@@ -168,7 +167,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col ` followed by + // a `` for commits. Assert 1 commit per repo — if the + // filter leaked, alpha would show 2 (me's + colleague's). + commitCountRe := regexp.MustCompile(`\s*`) + rows := commitCountRe.FindAllStringSubmatch(out, -1) + if len(rows) != 2 { + t.Fatalf("expected 2 repo rows in breakdown, got %d: %v", len(rows), rows) + } + for _, r := range rows { + if r[2] != "1" { + t.Errorf("repo %s shows %s commits in profile breakdown — email filter leaked (want 1)", r[1], r[2]) + } + } + + // Colleague-exclusive file must not appear in my scope. The + // template renders file counts as `` inside each row + // — indirect assertion: if the file-count bumped, the dev-filter + // on devCommits is wrong. + if strings.Contains(out, "colleague-only.go") { + t.Error("profile report mentions a file only the colleague touched") + } } func TestGenerate_EmptyDataset(t *testing.T) { diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 8639acb..f07214d 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" "testing" + "time" ) func TestDiscover_FindsRepos(t *testing.T) { @@ -184,6 +185,74 @@ func TestAssignSlugs_DeterministicUnderRetry(t *testing.T) { } } +// Mid-walk cancel: the pre-cancelled test below only proves the +// first callback checks ctx. This test cancels AFTER Discover has +// already started walking — the scenario users hit with Ctrl+C on +// a long home-dir scan. Asserting the walk stops in flight, not +// just when prompted at the very start. +// +// The tree is large enough (5,000 dirs × 3 subdirs = 20k nodes) +// that a full walk on typical hardware takes tens of ms; sleeping +// briefly before cancel lands the signal mid-flight. 
We assert: +// - err is context.Canceled (ctx check fired inside the walk) +// - elapsed time < full baseline (walk was actually interrupted) +// - at least one repo was discovered BEFORE cancel (not a +// pre-cancel scenario in disguise) +func TestDiscover_AbortsMidWalk(t *testing.T) { + if testing.Short() { + t.Skip("builds a large synthetic tree; skipped in -short mode") + } + root := t.TempDir() + const topLevel = 5000 + for i := 0; i < topLevel; i++ { + parent := filepath.Join(root, fmt.Sprintf("d-%05d", i)) + for j := 0; j < 3; j++ { + if err := os.MkdirAll(filepath.Join(parent, fmt.Sprintf("sub-%d", j)), 0o755); err != nil { + t.Fatal(err) + } + } + // Every top-level dir is a repo so the result count is a + // direct proxy for how far the walk got. + mustMkRepo(t, parent) + } + + // Baseline: uncancelled walk finds all topLevel repos. + baseStart := time.Now() + full, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) + if err != nil { + t.Fatal(err) + } + baseline := time.Since(baseStart) + if len(full) != topLevel { + t.Fatalf("baseline walk should find all %d repos, got %d", topLevel, len(full)) + } + + // Cancelled walk: give the walk a head start, then cancel. + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + time.Sleep(baseline / 4) + cancel() + close(done) + }() + + cancStart := time.Now() + repos, err := Discover(ctx, []string{root}, NewMatcher(nil), 0) + cancElapsed := time.Since(cancStart) + <-done + + if err != context.Canceled { + t.Fatalf("want context.Canceled after mid-walk cancel, got err=%v (elapsed %v, baseline %v, repos %d)", + err, cancElapsed, baseline, len(repos)) + } + if cancElapsed >= baseline { + t.Errorf("cancelled walk took %v — baseline was %v. 
Ctx respected but walk did not shortcut; users will see delayed Ctrl+C", cancElapsed, baseline) + } + if len(repos) >= topLevel { + t.Errorf("cancelled walk returned all %d repos — appears to have finished before cancel fired; baseline %v, elapsed %v", len(repos), baseline, cancElapsed) + } +} + // A cancelled context must abort the walk — previously Discover // ignored ctx entirely, so Ctrl+C during the walk phase only took // effect after every directory had been stat'd. Test uses a From 0947705215d3828a5bb84cac10ebfc0f7aef018d Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 13:47:29 -0300 Subject: [PATCH 13/37] Document scan command and per-repo breakdown metric README gains a full `Scan: discover and aggregate every repo under a root` section covering the headline flows: consolidated team report, `--email` personal cross-repo profile, multi-root workspaces. Covers the output directory layout (slug.jsonl, slug.state, manifest.json), the `.gitcortex-ignore` syntax with negation examples, and the tuning flags forwarded to the consolidated report. HTML report section notes the new Per-Repository Breakdown that appears on multi-repo runs. Architecture tree picks up internal/scan and internal/stats/repo_breakdown. METRICS.md adds a Per-Repository Breakdown entry describing columns, how the repo label is derived from the LoadMultiJSONL path prefix, and the divergence-between-commits-and-churn signal. Vendor/generated warning section notes the prefix-stripping for multi-JSONL inputs so readers understand why suggestions come out repo-relative. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++- docs/METRICS.md | 23 ++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 29ea136..bdcf222 100644 --- a/README.md +++ b/README.md @@ -401,6 +401,67 @@ gitcortex stats --input auth.jsonl --input payments.jsonl --stat coupling --top Paths appear as `auth:src/main.go` and `payments:src/main.go`. Contributors are deduped by email across repos — the same developer contributing to both repos is counted once. +For workspaces containing many repos (an engineer's `~/work`, a platform team's service folder), `gitcortex scan` discovers every `.git` under one or more roots and extracts them in parallel — see below. + +### Scan: discover and aggregate every repo under a root + +Walk one or more directories, find every git repository (working trees and bare clones both detected), extract them in parallel, and optionally generate a consolidated HTML report. The main use case is "show my manager every project I've touched in the last year" or "give a platform team one dashboard across N services" without scripting the multi-extract + multi-input pipeline manually. 
+
+```bash
+# Discover and extract every repo under ~/work, one JSONL per repo
+gitcortex scan --root ~/work --output ./scan-out
+
+# Consolidated report across all discovered repos
+gitcortex scan --root ~/work --output ./scan-out --report ./all.html
+
+# Personal cross-repo profile: only MY commits, with per-repo breakdown
+gitcortex scan --root ~/work --output ./scan-out \
+ --report ./me.html --email me@company.com --since 1y \
+ --include-commit-messages
+
+# Multiple roots, higher parallelism, pre-set ignore patterns
+gitcortex scan --root ~/work --root ~/personal --root ~/oss \
+ --parallel 8 --max-depth 4 \
+ --output ./scan-out --report ./all.html
+```
+
+The scan output directory holds:
+
+| file | purpose |
+|---|---|
+| `<slug>.jsonl` | per-repo JSONL, one per discovered repo |
+| `<slug>.state` | resume checkpoint (safe to re-run scan to continue) |
+| `manifest.json` | discovery results, per-repo status (ok/failed/pending), timing |
+
+Each repo's slug is derived from its directory basename; colliding basenames get a short SHA-1 suffix (the suffix lengthens automatically on the rare truncation collision, so `<slug>.state` is stable across runs).
+
+**Filtering discovery with `.gitcortex-ignore`.** Create a gitignore-style file at the scan root:
+
+```
+# skip heavy clones we don't want in the report
+node_modules
+chromium.git
+linux.git
+
+# skip vendored repos except the one we own
+vendor/
+!vendor/in-house-fork
+```
+
+Directory rules, globs, `**/foo`, and `!path` negations all work. Globbed negations like `!vendor*/keep` are honored — discovery descends into any dir where a negation rule could match a descendant. If `--ignore-file` is not set, scan looks for `.gitcortex-ignore` in the first `--root`.
+
+**Consolidated report extras.** When a scan produces more than one repo's data, both the team report and `--email` profile report render a new *Per-Repository Breakdown* section: commits, churn, files, active days, and the share-of-total for each repo. 
On an `--email` profile the counts are filtered to that developer's contributions (files count reflects only files the dev touched).
+
+**Flags worth knowing:**
+
+- `--parallel N` — repos extracted concurrently (default 4). Git is I/O-bound, so values past NumCPU give diminishing returns.
+- `--max-depth N` — stop descending past N levels. Useful when a root contains a monorepo with deeply nested internal repos you don't want enumerated.
+- `--extract-ignore <glob>` (repeatable) — forwarded to each per-repo `extract --ignore`, e.g. `--extract-ignore 'package-lock.json' --extract-ignore 'dist/*'`.
+- `--from / --to / --since` — time window applied to the consolidated report (same semantics as `report`).
+- `--churn-half-life`, `--coupling-max-files`, `--coupling-min-changes`, `--network-min-files` — pass tuning to the consolidated report identical to `gitcortex report`.
+
+Partial failures are non-fatal: the manifest records which repos failed, and the report is built from whichever JSONLs completed. `Ctrl+C` aborts both the discovery walk and any in-flight extracts; re-running picks up from each repo's state file.
+
 ### Diff: compare time periods
 
 Compare stats between two time periods, or filter to a single period.
@@ -444,6 +505,8 @@ gitcortex report --input data.jsonl --email alice@company.com --output alice.htm
 
 Includes: summary cards, activity heatmap (with table toggle), top contributors, file hotspots, churn risk (with full-dataset label distribution strip above the truncated table), bus factor, file coupling, working patterns heatmap, top commits, developer network, and developer profiles. A collapsible glossary at the top defines the terms (bus factor, churn, legacy-hotspot, specialization, etc.) for readers who are not already familiar. Typical size: 50-500KB depending on number of contributors. 
+When the input is multi-repo (from `gitcortex scan` or multiple `--input` files), both the team report and `--email` profile also render a *Per-Repository Breakdown* with commit/churn/files/active-days per repo and each repo's share of the total. + > The HTML activity heatmap is always monthly (year × 12 months grid). For day/week/year buckets, use `gitcortex stats --stat activity --granularity `. ### CI: quality gates for pipelines @@ -483,9 +546,14 @@ internal/ parse.go Shared types (RawEntry, NumstatEntry) discard.go Malformed entry tracking extract/extract.go Extraction orchestration, state, JSONL writing + scan/ + scan.go Multi-repo orchestration (worker pool over extract) + discovery.go Directory walk, bare-repo detection, slug uniqueness + ignore.go Gitignore-style matcher with negation support stats/ - reader.go Streaming JSONL aggregator (single-pass) + reader.go Streaming JSONL aggregator (single-pass, multi-JSONL) stats.go Stat computations (9 stats) + repo_breakdown.go Per-repository aggregate (scan consolidated report) format.go Table/CSV/JSON output formatting ``` diff --git a/docs/METRICS.md b/docs/METRICS.md index 7ed473e..6749855 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -294,6 +294,27 @@ A `tree(1)`-style view of the repository's directory layout, built from paths se **When to use**: before drilling into hotspots or churn-risk, skim the structure to locate the modules those files live in. The tree is navigational context; ranked tables are where judgment happens. +## Per-Repository Breakdown + +Cross-repo aggregation that appears only when the dataset was loaded from more than one JSONL (typically via `gitcortex scan` or multiple `--input` files). 
One row per repo:
+
+| column | meaning |
+|---|---|
+| `Commits` | author-dated commit count in this repo |
+| `% Commits` | share of total commits in the dataset |
+| `Churn` | additions + deletions attributed to this repo |
+| `% Churn` | share of total churn |
+| `Files` | distinct files touched in this repo; when filtered by `--email`, counts only files that developer touched |
+| `Active days` | distinct UTC author-dates |
+| `Devs` | unique author emails in this repo |
+| `First → Last` | earliest and latest author-date |
+
+**How the repo label is derived**: `LoadMultiJSONL` prefixes every path in the dataset with `<slug>:` (so `WordPress.git.jsonl` contributes paths like `WordPress.git:wp-includes/foo.php`). The breakdown groups by that prefix. If only a single JSONL is loaded, no prefix is emitted and the breakdown collapses to a single `(repo)` row the HTML report hides.
+
+**Divergence between `% Commits` and `% Churn` is informative**: a repo dominating churn while holding modest commit share often signals large-content work (docs, data), while the reverse points to small-diff high-frequency repos (config, manifests).
+
+**SHA collisions across repos** (forks, mirrors, cherry-picks between sibling projects) are preserved here — the breakdown tracks commits per repo via a dedicated slice populated at ingest, not the SHA-keyed commit map. Other file-level metrics (bus factor, coupling, dev network) still key by SHA and will collapse collided commits onto one record; if exact attribution matters for those, scan and aggregate the sibling repos separately.
+
 ## Data Flow
 
 ```
@@ -364,6 +385,8 @@ Directory-segment heuristics (`vendor`, `node_modules`, `dist`, `build`, `third_
 
 The warning is advisory. Nothing is auto-filtered; the user decides whether to re-extract. Matches do not affect computed stats in that run. JSON/CSV output paths skip the warning since they're typically piped.
 
+On multi-JSONL loads (e.g. 
`stats --input a.jsonl --input b.jsonl` or a `gitcortex scan` dataset), paths carry a `:` prefix internally. The suspect detector strips that prefix before matching and suggesting, so root-level `vendor/`, `package-lock.json`, `go.sum`, etc. in any individual repo are detected and the emitted `--ignore` globs are repo-relative (drop-in for `extract --ignore` and `scan --extract-ignore`). Same-shape findings across repos collapse to one suggestion (`dist/*` applies everywhere rather than being listed once per repo). + Statistical heuristics (very high churn-per-commit, single-author bulk updates) are deliberately out of scope — their false-positive rate on hand-authored code is higher than the path-based list and we'd rather stay quiet than cry wolf. ### `--mailmap` off by default From 00b7321ffe6adebb3d758082ea4069a79b18ce75 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 14:00:43 -0300 Subject: [PATCH 14/37] Validate --since before launching scan scanCmd parsed --since only after scan.Run had already walked every root and extracted every discovered repo. An invalid value (`--since 1yy`, `--since nope`) cost minutes-to-hours of discovery and extraction on large workspaces before the CLI finally rejected the flag. Move parseSince up alongside the other flag validation so obvious typos fail instantly without doing any filesystem work. Test runs scanCmd against an empty TempDir with --since bogus and asserts two things: the error mentions --since (early validation fired), and it does NOT contain "no git repositories found" (the error scan.Run would have produced had it been reached). Together these prove scan.Run was not called. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 22 ++++++++++++--------- cmd/gitcortex/main_test.go | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index ddcf0eb..0886401 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -921,6 +921,19 @@ breakdown — handy for showing aggregated work across many repos.`, if from != "" && to != "" && from > to { return fmt.Errorf("--from (%s) must be on or before --to (%s)", from, to) } + // Resolve --since up-front. If this fails we'd otherwise + // discover the typo only after a full multi-repo scan — + // minutes to hours on a large workspace, all thrown away + // because the user mistyped `--since 1yy`. Validate early, + // fail fast, keep the result for the report stage. + fromDate := from + if since != "" { + d, err := parseSince(since) + if err != nil { + return err + } + fromDate = d + } cfg := scan.Config{ Roots: roots, @@ -958,15 +971,6 @@ breakdown — handy for showing aggregated work across many repos.`, return fmt.Errorf("no successful repos extracted; cannot build report") } - fromDate := from - if since != "" { - d, err := parseSince(since) - if err != nil { - return err - } - fromDate = d - } - ds, err := stats.LoadMultiJSONL(result.JSONLs, stats.LoadOptions{ From: fromDate, To: to, diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index 57410bf..bf7edd3 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -1,10 +1,50 @@ package main import ( + "bytes" "strings" "testing" ) +// scanCmd must validate --since BEFORE running the discovery walk +// and extract pool. Without the early check, an obvious typo like +// `--since 1yy` only surfaces after scan.Run has already walked +// every root and extracted every repo found — which can take +// minutes-to-hours on a large workspace and waste the work. 
+// +// Use an empty TempDir so scan.Run would fail fast with "no git +// repositories found" if we reached it — that error does not mention +// "since", so an error containing "since" proves the early +// validation fired first. +func TestScanCmd_ValidatesSinceBeforeScanning(t *testing.T) { + cmd := scanCmd() + cmd.SilenceUsage = true + cmd.SilenceErrors = true + cmd.SetArgs([]string{ + "--root", t.TempDir(), + "--output", t.TempDir(), + "--since", "bogus", + }) + // Swallow any stderr output cobra might emit so we don't pollute + // go-test logs on success. + cmd.SetOut(&bytes.Buffer{}) + cmd.SetErr(&bytes.Buffer{}) + + err := cmd.Execute() + if err == nil { + t.Fatal("expected error for --since bogus, got nil") + } + if !strings.Contains(err.Error(), "since") { + t.Errorf("expected error to mention --since (proving early validation), got %q", err) + } + // Reaching scan.Run on an empty TempDir would produce the + // discovery error; asserting its absence confirms the walk was + // not started. + if strings.Contains(err.Error(), "no git repositories found") { + t.Errorf("scan.Run was reached before --since was validated — discovery ran on an invalid-flag input: %q", err) + } +} + func TestValidateDate(t *testing.T) { cases := []struct { in string From 814269588033a7795220ad49d8d336c51dd07d11 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 14:02:13 -0300 Subject: [PATCH 15/37] Resolve symlink roots before walking discovery tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit filepath.WalkDir has a well-known quirk: when the ROOT argument is a symlink it visits only the link entry (reported as ModeSymlink) and refuses to descend into the target. Users whose `~/work` is a symlink to `/mnt/data/work` — a common setup on machines with a separate data disk — would see discovery return zero repos and scan conclude "no git repositories found" despite the target being full of them. 
os.Stat already followed the link for the is-directory check, so the error path silently branched at WalkDir instead of surfacing a clear problem. Canonicalize the root with filepath.EvalSymlinks before stat'ing; the walk then starts at a real directory path. Symlinks ENCOUNTERED during the walk (as opposed to the root itself) are left as-is — we don't chase every link on disk, only the one the user explicitly named as a root. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 19 +++++++++++++++++ internal/scan/discovery_test.go | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 2f2716a..3a76b8c 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -55,6 +55,25 @@ func Discover(ctx context.Context, roots []string, matcher *Matcher, maxDepth in if err != nil { return nil, fmt.Errorf("resolve %s: %w", root, err) } + // Canonicalize via EvalSymlinks. os.Stat above followed the + // symlink for the is-directory check, but filepath.WalkDir + // treats a symlink ROOT specially: it visits only the link + // itself (reported with ModeSymlink) and does NOT descend + // into the target. For a user whose ~/work is a symlink to + // /mnt/data/work — a common setup — the walk would yield + // zero callbacks past the root and scan would conclude "no + // git repositories found under ..." despite the target being + // full of repos. EvalSymlinks dereferences the root once so + // WalkDir starts with a real directory path; links ENCOUNTERED + // during the walk are still left as-is (default WalkDir + // behavior), which is what we want — we don't chase every + // symlink we come across, only the ones the user explicitly + // named as a root. 
+ resolved, err := filepath.EvalSymlinks(abs) + if err != nil { + return nil, fmt.Errorf("resolve symlinks for %s: %w", abs, err) + } + abs = resolved info, err := os.Stat(abs) if err != nil { return nil, fmt.Errorf("stat %s: %w", abs, err) diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index f07214d..8dcaadc 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -198,6 +198,42 @@ func TestAssignSlugs_DeterministicUnderRetry(t *testing.T) { // - elapsed time < full baseline (walk was actually interrupted) // - at least one repo was discovered BEFORE cancel (not a // pre-cancel scenario in disguise) +// Regression: WalkDir given a symlink ROOT visits only the link +// entry and refuses to descend. Users whose ~/work is a symlink to +// a real data disk would previously see "no git repositories found" +// despite the target being full of repos. Canonicalize via +// EvalSymlinks before walking so the walk starts at a real dir. +func TestDiscover_DereferencesSymlinkRoot(t *testing.T) { + real := t.TempDir() + mustMkRepo(t, filepath.Join(real, "repo-a")) + mustMkRepo(t, filepath.Join(real, "nested", "repo-b")) + + // Place the symlink in a separate TempDir so the test doesn't + // have to clean it up explicitly. 
+ linkDir := t.TempDir() + link := filepath.Join(linkDir, "work") + if err := os.Symlink(real, link); err != nil { + t.Skipf("symlinks not supported here: %v", err) + } + + repos, err := Discover(context.Background(), []string{link}, NewMatcher(nil), 0) + if err != nil { + t.Fatalf("Discover via symlink root: %v", err) + } + if len(repos) != 2 { + t.Fatalf("want 2 repos found through symlink root, got %d: %+v", len(repos), repos) + } + got := map[string]bool{} + for _, r := range repos { + got[r.Slug] = true + } + for _, want := range []string{"repo-a", "repo-b"} { + if !got[want] { + t.Errorf("expected slug %q in %v (symlink root was not dereferenced)", want, got) + } + } +} + func TestDiscover_AbortsMidWalk(t *testing.T) { if testing.Short() { t.Skip("builds a large synthetic tree; skipped in -short mode") From 5bd94ab0d9cdf5d645cc86bd7d807a9d38a1440e Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 17:12:01 -0300 Subject: [PATCH 16/37] Normalize slug uniqueness for case-insensitive filesystems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On macOS (APFS/HFS+ default) and Windows (NTFS default), Repo.jsonl and repo.jsonl resolve to the same file on disk. The case-sensitive counts[base] / seen[slug] checks treated two repos named Repo and repo as distinct, handed each the bare basename, and let the filesystem silently merge their JSONL and state files — one scan's output overwriting the other with no warning. Fold to lower case for both the duplicate count and the seen-set so case-only basename differences now trigger the hash-suffix branch. Emitted slugs keep their original case for readability; the collision check that decides filesystem safety runs on the lower-cased form, which is what the OS compares against. 
Test pairs /a/Team/Repo with /b/OSS/repo and asserts the slugs differ even under strings.EqualFold and both carry a hash suffix — without the fix, both would land on bare "Repo"/"repo" and collide on disk. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 21 +++++++++++++++++---- internal/scan/discovery_test.go | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 3a76b8c..2989893 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -208,6 +208,18 @@ const initialSlugHashLen = 6 // redo the pass with a longer hash. Grows up to the full 40 hex chars // before panicking — needing that much is a cryptographic event, not // a scan-time bug. +// +// Case-insensitive uniqueness: on macOS (APFS/HFS+ in the default +// configuration) and Windows (NTFS), `Repo.jsonl` and `repo.jsonl` +// resolve to the same file on disk. A case-sensitive slug compare +// would treat the two repos as distinct, hand each the bare basename, +// and then quietly let the filesystem merge their JSONL and state +// files. Fold to lower case for BOTH the duplicate-detection count +// and the uniqueness seen-set so collision-triggered hashing fires +// on case-only differences. The emitted slug retains the original +// case for readability (so paths named Repo and repo produce +// `Repo-` and `repo-` — visually distinct to humans, +// case-insensitively distinct on disk). 
func assignSlugs(repos []Repo) { counts := make(map[string]int) bases := make([]string, len(repos)) @@ -217,7 +229,7 @@ func assignSlugs(repos []Repo) { base = "repo" } bases[i] = base - counts[base]++ + counts[strings.ToLower(base)]++ } for hashLen := initialSlugHashLen; hashLen <= 40; hashLen += 2 { @@ -227,15 +239,16 @@ func assignSlugs(repos []Repo) { for i := range repos { base := bases[i] slug := base - if counts[base] > 1 { + if counts[strings.ToLower(base)] > 1 { h := sha1.Sum([]byte(repos[i].AbsPath)) slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:hashLen]) } - if prev, ok := seen[slug]; ok && prev != i { + key := strings.ToLower(slug) + if prev, ok := seen[key]; ok && prev != i { collided = true break } - seen[slug] = i + seen[key] = i proposed[i] = slug } if !collided { diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 8dcaadc..5c1fa19 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "testing" "time" ) @@ -162,6 +163,38 @@ func findColliding6HexPaths(maxAttempts int) (string, string, bool) { return "", "", false } +// Regression: on macOS (APFS/HFS+ default) and Windows (NTFS), +// `Repo.jsonl` and `repo.jsonl` are the same file on disk. A +// case-sensitive slug compare would hand both repos the bare +// basename and let the filesystem merge their outputs — one scan's +// JSONL silently overwrites the other. Counts and seen-set are +// lower-cased so any case-only difference is treated as a collision +// and forces the hash suffix. 
+func TestAssignSlugs_CaseInsensitiveUniqueness(t *testing.T) { + repos := []Repo{ + {AbsPath: "/a/Team/Repo"}, + {AbsPath: "/b/OSS/repo"}, + } + assignSlugs(repos) + + if repos[0].Slug == repos[1].Slug { + t.Fatalf("repos with case-only-differing basenames got same slug %q", repos[0].Slug) + } + // The fs collision only goes away if the slugs also differ when + // folded to lower case — which is what decides filenames on + // case-insensitive fs. + if strings.EqualFold(repos[0].Slug, repos[1].Slug) { + t.Fatalf("slugs %q and %q differ only in case — they would still collide on macOS/Windows", repos[0].Slug, repos[1].Slug) + } + // Sanity: both should have gained a hash suffix (the bare `Repo` + // and `repo` weren't safe to emit). + for _, r := range repos { + if !strings.Contains(r.Slug, "-") { + t.Errorf("repo %s got bare slug %q; case-sensitive branch did not trigger hashing", r.AbsPath, r.Slug) + } + } +} + // Same paths ingested twice must produce the same slugs — this is // what makes resume work across runs. Even with the retry loop // adjusting hash length under collisions, a deterministic input set From 89df5862addb4ef2cf8c7e88f72859a827be152f Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 17:22:02 -0300 Subject: [PATCH 17/37] Fail scan when every repo's extract failed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scanCmd's no-report branch returned success unconditionally, so a run where every discovered repo's extract failed still exited 0 after printing "Scan complete: 0 JSONL file(s)". CI and any other automation that checked only the exit code treated a fully-failed scan as successful and moved on to downstream steps with empty artifacts. The report branch already had the guard; apply it uniformly before the branch so both paths behave. scan.Run is intentionally non-fatal on per-repo errors so one transient failure doesn't tank a batch — that policy remains. 
But when the successful count is zero, there's nothing useful on disk to inspect and the CLI must surface the problem. The error message points at the manifest, which is where per-repo Status fields record which repos failed and why. Test creates a fake repo (empty .git dir that discovery picks up but extract rejects as "not a git repository") and asserts scanCmd's error surfaces. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 17 ++++++++++++++--- cmd/gitcortex/main_test.go | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 0886401..948aac4 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -963,13 +963,24 @@ breakdown — handy for showing aggregated work across many repos.`, return err } + // Exit non-zero when every repo failed to extract, regardless + // of whether a report was requested. scan.Run intentionally + // treats per-repo failures as non-fatal so a transient error + // on one repo doesn't tank the whole batch — but when the + // count of successes is zero, there's nothing useful on disk + // to inspect. Without this guard CI sees exit code 0 and + // continues with empty artifacts; the manifest's per-repo + // Status fields remain the only way to see the failures. 
+ if len(result.JSONLs) == 0 { + return fmt.Errorf("scan found %d repositor(ies) but all extracts failed; see %s for per-repo status", + len(result.Manifest.Repos), + filepath.Join(result.OutputDir, "manifest.json")) + } + if reportPath == "" { fmt.Fprintf(os.Stderr, "Scan complete: %d JSONL file(s) in %s\n", len(result.JSONLs), result.OutputDir) return nil } - if len(result.JSONLs) == 0 { - return fmt.Errorf("no successful repos extracted; cannot build report") - } ds, err := stats.LoadMultiJSONL(result.JSONLs, stats.LoadOptions{ From: fromDate, diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index bf7edd3..3c5fcc0 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -2,6 +2,8 @@ package main import ( "bytes" + "os" + "path/filepath" "strings" "testing" ) @@ -45,6 +47,42 @@ func TestScanCmd_ValidatesSinceBeforeScanning(t *testing.T) { } } +// scanCmd must exit non-zero when every discovered repo's extract +// failed, regardless of whether --report was requested. scan.Run +// intentionally treats per-repo failures as non-fatal (a transient +// error on one repo shouldn't tank the whole batch), but if ZERO +// repos succeed there's nothing useful on disk and CI should know. +// Previously the no-report branch returned success unconditionally; +// automation saw exit 0 and "Scan complete: 0 JSONL file(s)" and +// continued with empty artifacts. +// +// Test uses a fake repo (a bare `.git` dir with no history) so +// discovery picks it up but extract fails. 
+func TestScanCmd_ExitsNonZeroWhenAllExtractsFail(t *testing.T) { + root := t.TempDir() + if err := os.MkdirAll(filepath.Join(root, "fake-repo", ".git"), 0o755); err != nil { + t.Fatal(err) + } + + cmd := scanCmd() + cmd.SilenceUsage = true + cmd.SilenceErrors = true + cmd.SetOut(&bytes.Buffer{}) + cmd.SetErr(&bytes.Buffer{}) + cmd.SetArgs([]string{ + "--root", root, + "--output", t.TempDir(), + }) + + err := cmd.Execute() + if err == nil { + t.Fatal("expected non-zero exit when all extracts fail; CI would silently treat this as success") + } + if !strings.Contains(err.Error(), "all extracts failed") { + t.Errorf("error should explain every repo failed; got %q", err) + } +} + func TestValidateDate(t *testing.T) { cases := []struct { in string From 66f45169bd5b2d4c98d56cf613142adc98f407cc Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 17:24:36 -0300 Subject: [PATCH 18/37] Reject missing explicit ignore file paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LoadMatcher treated os.IsNotExist as success and returned an empty matcher. A typo in `--ignore-file my.ignorre` silently disabled every rule and widened discovery to include node_modules/, vendor/, and any other trees the user thought they had excluded — with no signal from the console that anything was wrong. Users would see a consolidated report loaded with unwanted content and have no breadcrumb pointing at the mistyped flag. Fail loudly in LoadMatcher. The implicit default-path lookup (scan.loadMatcher checking for .gitcortex-ignore at the first root) already os.Stats the candidate before calling LoadMatcher, so the "silent when absent" semantics survive exactly where they belong: not on explicit user input. Tests pair a unit-level LoadMatcher check with a CLI-layer `--ignore-file /typo.ignore` check; the latter verifies the error surfaces before discovery starts (no "no git repositories found" wrapper from a downstream phase). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main_test.go | 33 +++++++++++++++++++++++++++++++++ internal/scan/ignore.go | 13 +++++++------ internal/scan/ignore_test.go | 24 +++++++++++++++--------- 3 files changed, 55 insertions(+), 15 deletions(-) diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index 3c5fcc0..aca5896 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -83,6 +83,39 @@ func TestScanCmd_ExitsNonZeroWhenAllExtractsFail(t *testing.T) { } } +// `scan --ignore-file /nonexistent` must fail at the CLI layer — +// silently falling back to zero rules would widen discovery scope +// without telling the user (e.g. node_modules and vendor/ dirs the +// user thought they excluded suddenly appear in the report). The +// default-path lookup in scan.loadMatcher still tolerates a missing +// `.gitcortex-ignore` at the first root; only the explicit flag is +// strict. +func TestScanCmd_RejectsMissingIgnoreFile(t *testing.T) { + cmd := scanCmd() + cmd.SilenceUsage = true + cmd.SilenceErrors = true + cmd.SetOut(&bytes.Buffer{}) + cmd.SetErr(&bytes.Buffer{}) + cmd.SetArgs([]string{ + "--root", t.TempDir(), + "--output", t.TempDir(), + "--ignore-file", filepath.Join(t.TempDir(), "typo.ignore"), + }) + + err := cmd.Execute() + if err == nil { + t.Fatal("expected error for missing --ignore-file; a typo must not silently widen scope") + } + if !strings.Contains(err.Error(), "typo.ignore") { + t.Errorf("error should identify the missing file; got %q", err) + } + // Must fail BEFORE discovery starts — scan on an empty TempDir + // would otherwise produce "no git repositories found". 
+ if strings.Contains(err.Error(), "no git repositories found") { + t.Errorf("ignore-file check ran after discovery; got %q", err) + } +} + func TestValidateDate(t *testing.T) { cases := []struct { in string diff --git a/internal/scan/ignore.go b/internal/scan/ignore.go index 3101124..bffce2c 100644 --- a/internal/scan/ignore.go +++ b/internal/scan/ignore.go @@ -41,15 +41,16 @@ func NewMatcher(patterns []string) *Matcher { return m } -// LoadMatcher reads an ignore file and returns a matcher. A missing file -// is not an error — scan simply proceeds with no user-provided rules. -// This matches the gitignore contract where `.gitignore` is optional. +// LoadMatcher reads an ignore file and returns a matcher. Missing files +// are rejected — this entry point is used when the path was specified +// explicitly (e.g. `--ignore-file foo.ignore`), where a typo should +// fail loudly instead of silently disabling all ignore rules and +// widening discovery scope. Callers that want "load if present, empty +// matcher otherwise" semantics (implicit default-path lookup) should +// os.Stat first. 
func LoadMatcher(file string) (*Matcher, error) { f, err := os.Open(file) if err != nil { - if os.IsNotExist(err) { - return NewMatcher(nil), nil - } return nil, fmt.Errorf("open %s: %w", file, err) } defer f.Close() diff --git a/internal/scan/ignore_test.go b/internal/scan/ignore_test.go index ed9510f..cda4f25 100644 --- a/internal/scan/ignore_test.go +++ b/internal/scan/ignore_test.go @@ -3,6 +3,7 @@ package scan import ( "os" "path/filepath" + "strings" "testing" ) @@ -45,16 +46,21 @@ func TestMatcher_Basics(t *testing.T) { } } -func TestLoadMatcher_MissingFileIsOK(t *testing.T) { - m, err := LoadMatcher(filepath.Join(t.TempDir(), "missing")) - if err != nil { - t.Fatalf("expected no error for missing file, got %v", err) - } - if m == nil { - t.Fatal("matcher should not be nil") +// LoadMatcher is called when the user supplied an explicit +// --ignore-file; a missing file there is almost always a typo. If +// we silently returned an empty matcher, every discovery would +// widen to include node_modules/, vendor/, chromium-clones/, etc. +// the user thought they had excluded — and they'd have no way to +// tell from the console output. Fail loudly instead; the +// default-path lookup in scan.loadMatcher handles the "silent when +// absent" case via its own os.Stat before calling here. 
+func TestLoadMatcher_MissingFileFails(t *testing.T) { + _, err := LoadMatcher(filepath.Join(t.TempDir(), "typo.ignore")) + if err == nil { + t.Fatal("expected error for missing explicit ignore file; a typo must not silently disable rules") } - if m.Match("anything", false) { - t.Error("empty matcher should match nothing") + if !os.IsNotExist(err) && !strings.Contains(err.Error(), "typo.ignore") { + t.Errorf("error should identify the missing path; got %q", err) } } From f62cdc5a85d3280b6579c006e33e42d75042e90d Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 17:43:13 -0300 Subject: [PATCH 19/37] Recover from worker panics in the scan pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If extract.Run panicked inside a worker goroutine (unexpected git binary output, nil deref from a future refactor, etc.) the goroutine would crash the entire scan process before its manifest.Repos[idx] slot was written. At best the repo stayed forever at Status="pending" — indistinguishable from "worker never dispatched this repo", which misled both operators and downstream automation. At worst, the process died mid-batch and the other in-flight extracts were lost. Wrap the per-job call in runJobSafely: defer-recover converts any panic into a ManifestRepo with Status="failed" and an Error prefixed with "panic:". The sibling workers keep pulling from the jobs channel, so the rest of the batch completes. extract.Run is not expected to panic under normal inputs — this is a defensive seam for the "impossible" cases the branch review flagged as the only remaining untested failure mode. Package-level runJobFn indirection lets the test override the worker body to force a deterministic panic without needing a broken git binary or a real extract bug to reproduce against. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/scan.go | 37 +++++++++++++++++-- internal/scan/scan_test.go | 73 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 2 deletions(-) diff --git a/internal/scan/scan.go b/internal/scan/scan.go index 4417ad6..a634f21 100644 --- a/internal/scan/scan.go +++ b/internal/scan/scan.go @@ -126,8 +126,7 @@ func Run(ctx context.Context, cfg Config) (*Result, error) { go func() { defer wg.Done() for j := range jobs { - entry := runOne(ctx, cfg, j.repo) - manifest.Repos[j.idx] = entry + manifest.Repos[j.idx] = runJobSafely(ctx, cfg, j.repo) } }() } @@ -197,6 +196,40 @@ func loadMatcher(cfg Config) (*Matcher, error) { return LoadMatcher(path) } +// runJobFn is the injection point the worker pool uses. Default is +// the real runOne; tests override it to simulate extract panics and +// verify the pool's recovery path. Not exposed via any user-facing +// API — this is strictly a test seam. +var runJobFn = runOne + +// runJobSafely wraps runJobFn in a defer-recover. If extract panics +// inside a worker the goroutine would otherwise crash the entire +// process and the repo's manifest slot would stay at "pending", which +// is indistinguishable from a job that was never dispatched. Convert +// panics to a well-formed ManifestRepo with Status="failed" so the +// manifest reflects reality and the rest of the pool keeps running. +// +// extract.Run doesn't panic under normal inputs — the recover exists +// for the "impossible" cases (e.g. a git binary that writes malformed +// output, nil-deref bugs introduced by a future refactor). Better to +// isolate the blast radius than leave a silent corruption mode. 
+func runJobSafely(ctx context.Context, cfg Config, repo Repo) (entry ManifestRepo) { + defer func() { + if r := recover(); r != nil { + entry = ManifestRepo{ + Slug: repo.Slug, + Path: repo.AbsPath, + JSONL: filepath.Join(cfg.Output, repo.Slug+".jsonl"), + StateFile: filepath.Join(cfg.Output, repo.Slug+".state"), + Status: "failed", + Error: fmt.Sprintf("panic: %v", r), + } + log.Printf("scan: [%s] panicked: %v", repo.Slug, r) + } + }() + return runJobFn(ctx, cfg, repo) +} + func runOne(ctx context.Context, cfg Config, repo Repo) ManifestRepo { jsonlPath := filepath.Join(cfg.Output, repo.Slug+".jsonl") statePath := filepath.Join(cfg.Output, repo.Slug+".state") diff --git a/internal/scan/scan_test.go b/internal/scan/scan_test.go index db24410..ad0c073 100644 --- a/internal/scan/scan_test.go +++ b/internal/scan/scan_test.go @@ -13,6 +13,79 @@ import ( "github.com/lex0c/gitcortex/internal/stats" ) +// Regression: if extract.Run panics inside a worker (a future +// refactor with a nil deref, an unexpected git binary output, etc.), +// defer wg.Done() still closes the WaitGroup — but without a +// recover, the goroutine crashes the whole process before the +// manifest slot for that repo is written. Even without a full crash, +// the slot would silently stay at "pending" indistinguishable from +// "worker never reached it". The pool must convert panics to a +// ManifestRepo with Status="failed" and continue with sibling jobs. +// +// Override the injection seam runJobFn: first repo panics, second +// runs normally. Assert (1) the pool survives, (2) the panicking +// repo's slot is Status=failed with a panic: prefix, (3) the +// non-panicking repo still completes. 
+func TestRun_RecoversFromWorkerPanic(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not installed") + } + + root := t.TempDir() + makeRealRepo(t, filepath.Join(root, "boom"), map[string]string{"x.go": "x\n"}) + makeRealRepo(t, filepath.Join(root, "ok"), map[string]string{"y.go": "y\n"}) + + // Override runJobFn for the duration of this test only. + original := runJobFn + defer func() { runJobFn = original }() + runJobFn = func(ctx context.Context, cfg Config, repo Repo) ManifestRepo { + if repo.Slug == "boom" { + panic("simulated extract failure") + } + return original(ctx, cfg, repo) + } + + output := filepath.Join(t.TempDir(), "out") + cfg := Config{ + Roots: []string{root}, + Output: output, + Parallel: 1, // force serial so the panicking job is reached + Extract: extract.Config{ + BatchSize: 100, + CommandTimeout: extract.DefaultCommandTimeout, + }, + } + + res, err := Run(context.Background(), cfg) + if err != nil { + t.Fatalf("Run should not propagate the worker panic as an error; got %v", err) + } + + gotStatus := map[string]ManifestRepo{} + for _, r := range res.Manifest.Repos { + gotStatus[r.Slug] = r + } + boom, ok := gotStatus["boom"] + if !ok { + t.Fatal("boom slot missing from manifest") + } + if boom.Status != "failed" { + t.Errorf("boom should be failed after panic, got Status=%q", boom.Status) + } + if !strings.Contains(boom.Error, "panic") { + t.Errorf("boom error should carry a panic: prefix so operators can tell panics from normal failures; got %q", boom.Error) + } + if okRepo, okFound := gotStatus["ok"]; !okFound || okRepo.Status != "ok" { + t.Errorf("sibling job must still complete despite the other worker's panic; got %+v", okRepo) + } + // Ensure the pool didn't leak: all slots have a non-pending status. 
+ for _, r := range res.Manifest.Repos { + if r.Status == "pending" { + t.Errorf("repo %s left at 'pending' after Run returned — the pool abandoned a slot", r.Slug) + } + } +} + func TestRun_EndToEnd(t *testing.T) { if _, err := exec.LookPath("git"); err != nil { t.Skip("git not installed") From ecce5ee12fbf1e73a94501027e2f57a923d6e953 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 17:51:24 -0300 Subject: [PATCH 20/37] Scope Per-Repository Breakdown to profile reports only The breakdown was rendered on both the team report and the --email profile report whenever the dataset spanned multiple repos. On reflection the team view adds little: "how is git history distributed across the repos in this scan" is nearly tautological and usually better answered by manifest.json or running stats per-repo. The section earns its place only when filtered by developer, where the same numbers turn into a real consolidation story: "where did this person spend their time across projects". Remove the section from the team template; keep it on the profile template where it was already filtered by email. Drop the unused ReportData.Repos field and the stats.RepoBreakdown call from Generate; ProfileReportData.Repos remains the single populated home for the metric. Team-report regression test now asserts the section does NOT render even on a multi-repo dataset; the profile test (unchanged) continues to cover the positive case with email filtering. README and METRICS.md rewritten to frame the metric as profile-only. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 4 +-- docs/METRICS.md | 6 ++-- internal/report/report.go | 7 ----- internal/report/report_test.go | 55 ++++++++++------------------------ internal/report/template.go | 38 ----------------------- 5 files changed, 21 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index bdcf222..df25f09 100644 --- a/README.md +++ b/README.md @@ -450,7 +450,7 @@ vendor/ Directory rules, globs, `**/foo`, and `!path` negations all work. Globbed negations like `!vendor*/keep` are honored — discovery descends into any dir where a negation rule could match a descendant. If `--ignore-file` is not set, scan looks for `.gitcortex-ignore` in the first `--root`. -**Consolidated report extras.** When a scan produces more than one repo's data, both the team report and `--email` profile report render a new *Per-Repository Breakdown* section: commits, churn, files, active days, and the share-of-total for each repo. On an `--email` profile the counts are filtered to that developer's contributions (files count reflects only files the dev touched). +**Consolidated profile report.** When `scan --email me@company.com --report path.html` runs against a multi-repo dataset, the profile report renders a *Per-Repository Breakdown* section: commits, churn, files, active days, and share-of-total — all filtered to that developer's contributions (files count reflects only files the dev touched). The section is profile-only; the team report focuses on the aggregate across repos without a per-repo split (run `stats --input repo.jsonl` per-repo or read the `manifest.json` for that). 
**Flags worth knowing:** @@ -505,7 +505,7 @@ gitcortex report --input data.jsonl --email alice@company.com --output alice.htm Includes: summary cards, activity heatmap (with table toggle), top contributors, file hotspots, churn risk (with full-dataset label distribution strip above the truncated table), bus factor, file coupling, working patterns heatmap, top commits, developer network, and developer profiles. A collapsible glossary at the top defines the terms (bus factor, churn, legacy-hotspot, specialization, etc.) for readers who are not already familiar. Typical size: 50-500KB depending on number of contributors. -When the input is multi-repo (from `gitcortex scan` or multiple `--input` files), both the team report and `--email` profile also render a *Per-Repository Breakdown* with commit/churn/files/active-days per repo and each repo's share of the total. +When the input is multi-repo (from `gitcortex scan` or multiple `--input` files) AND `--email` is set, the profile report renders a *Per-Repository Breakdown* with commit/churn/files/active-days per repo, filtered to that developer's contributions. The team-view report intentionally omits this section — per-repo aggregates on a consolidated dataset reduce to raw git-history distribution, which is more usefully inspected via `manifest.json` or `stats --input X.jsonl` per repo. > The HTML activity heatmap is always monthly (year × 12 months grid). For day/week/year buckets, use `gitcortex stats --stat activity --granularity `. diff --git a/docs/METRICS.md b/docs/METRICS.md index 6749855..72b9fff 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -296,7 +296,9 @@ A `tree(1)`-style view of the repository's directory layout, built from paths se ## Per-Repository Breakdown -Cross-repo aggregation that appears only when the dataset was loaded from more than one JSONL (typically via `gitcortex scan` or multiple `--input` files). One row per repo: +Cross-repo aggregation scoped to a single developer. 
Renders in the HTML profile report (`report --email me@x.com` or `scan --email me@x.com --report …`) when the dataset was loaded from more than one JSONL. The metric is deliberately profile-only: on a team-view consolidated report, per-repo aggregates reduce to raw git-history distribution and are better inspected via `manifest.json` or by running `stats --input X.jsonl` per-repo. Filtered by a developer's email, the same numbers answer a genuinely different question — *"where did this person spend their time?"* — which is what the section surfaces. + +One row per repo: | column | meaning | |---|---| @@ -304,7 +306,7 @@ Cross-repo aggregation that appears only when the dataset was loaded from more t | `% Commits` | share of total commits in the dataset | | `Churn` | additions + deletions attributed to this repo | | `% Churn` | share of total churn | -| `Files` | distinct files touched in this repo; when filtered by `--email`, counts only files that developer touched | +| `Files` | distinct files the developer touched in this repo (always email-filtered, since the section only renders on profile reports) | | `Active days` | distinct UTC author-dates | | `Devs` | unique author emails in this repo | | `First → Last` | earliest and latest author-date | diff --git a/internal/report/report.go b/internal/report/report.go index 358c894..24c982f 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -68,12 +68,6 @@ type ReportData struct { TotalDirectories int TotalExtensions int TotalBusFactorFiles int - - // Repos holds per-repository aggregates for multi-repo (scan) reports. - // Empty on single-repo runs — the template gates the section behind - // `{{if gt (len .Repos) 1}}` so single-repo callers keep their - // existing layout untouched. - Repos []stats.RepoStat } // htmlTreeDepth caps the repo-structure tree baked into the HTML report. 
@@ -382,7 +376,6 @@ func Generate(w io.Writer, ds *stats.Dataset, repoName string, topN int, sf stat TotalDirectories: stats.DirectoryCount(ds), TotalExtensions: stats.ExtensionCount(ds), TotalBusFactorFiles: stats.BusFactorCount(ds), - Repos: stats.RepoBreakdown(ds, ""), } CapChildrenPerDir(data.Structure, htmlTreeMaxChildrenPerDir) diff --git a/internal/report/report_test.go b/internal/report/report_test.go index f27cf33..45a91eb 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -78,18 +78,16 @@ func TestGenerate_SmokeRender(t *testing.T) { } } -// Regression: the Per-Repository Breakdown section emits a CSS width -// for each repo's commit-share bar. The width has to be a numeric -// literal — if the template feeds a string through printf "%.0f" -// (e.g. via the string-returning pctFloat helper) html/template's -// CSS sanitizer replaces it with `ZgotmplZ` and the bars render at -// zero width. Exercise the multi-repo path end-to-end and assert -// the rendered widths are clean. -func TestGenerate_PerRepoBreakdownWidthsAreNumeric(t *testing.T) { +// The Per-Repository Breakdown is a consolidation metric about the +// developer's work, not about the repository set. It belongs on the +// profile (--email) report where "3 of my commits in auth, 50 in +// payments" tells a story; on the team report it would just restate +// git-history distribution, which is tautological. Assert the team +// render does NOT surface the section even with a multi-repo dataset. +func TestGenerate_TeamReportOmitsPerRepoBreakdown(t *testing.T) { dir := t.TempDir() alpha := filepath.Join(dir, "alpha.jsonl") beta := filepath.Join(dir, "beta.jsonl") - // Distinct SHAs and content so both repos appear in the breakdown. 
alphaRow := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","author_email":"me@x.com","author_name":"Me","author_date":"2024-01-01T00:00:00Z","additions":10,"deletions":0,"files_changed":1} {"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"a.go","additions":10,"deletions":0} ` @@ -111,37 +109,14 @@ func TestGenerate_PerRepoBreakdownWidthsAreNumeric(t *testing.T) { t.Fatalf("Generate: %v", err) } out := buf.String() - if !strings.Contains(out, "Per-Repository Breakdown") { - t.Fatal("breakdown section missing from multi-repo report") - } - if strings.Contains(out, "ZgotmplZ") { - t.Error("report contains ZgotmplZ — template fed a non-numeric value to a CSS context (likely printf'd a string)") - } - if strings.Contains(out, "%!f(string=") { - t.Error("report contains %!f(string=...) — printf was given a string where a float was expected") - } - - // Stronger guard: the absence-of-markers check above would pass - // if a future bug rendered every bar at `width:0%`. The breakdown - // tile has a green fill (background:#216e39) — extract every - // rendered width for those tiles and assert at least one is - // non-zero. With alpha's 10-churn commit and beta's 25-churn - // commit producing distinct totals, the commit-share proportions - // can't all collapse to zero unless the template is broken. 
- widthRe := regexp.MustCompile(`width:([0-9]+(?:\.[0-9]+)?)%; background:#216e39`) - matches := widthRe.FindAllStringSubmatch(out, -1) - if len(matches) == 0 { - t.Fatal("no repo-breakdown bar widths found; template may have changed selector") - } - sawNonZero := false - for _, m := range matches { - if m[1] != "0" && m[1] != "0.0" { - sawNonZero = true - break - } - } - if !sawNonZero { - t.Errorf("every repo bar rendered with width:0 — proportions would be invisible in the UI; widths: %v", matches) + if strings.Contains(out, "Per-Repository Breakdown") { + t.Error("team report should not render the Per-Repository Breakdown; it's a profile-only metric") + } + // Sanity: the scan JSONL prefix still shows up in file paths etc. + // — we didn't accidentally strip the repo context from the whole + // report. + if !strings.Contains(out, "alpha:") && !strings.Contains(out, "beta:") { + t.Error("expected repo-prefixed paths to appear somewhere in the team report; none found") } } diff --git a/internal/report/template.go b/internal/report/template.go index 9e19846..d6a3a64 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -145,44 +145,6 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col -{{if gt (len .Repos) 1}} -

Per-Repository Breakdown {{thousands (len .Repos)}} repositories

-

Cross-repo aggregation from scan. Use this view to see how work is distributed: a single repo dominating commits or churn often means the consolidated activity story is really about that one project; an even spread shows broad multi-repo engagement. Bar widths are the share of total commits across all scanned repos.

-
Repository
-
+
{{printf "%.1f" .PctOfTotalCommits}}%
From 6cfe86d3f18350e8038daa8ca86e229b17391d4e Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:48:21 -0300 Subject: [PATCH 08/37] Skip ignored repos before recording discovery hits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discover descended into ignored directories only to give negation rules a chance to re-include descendants, but after the descent decision it still ran the .git detection on the ignored parent. When the parent itself held a .git (e.g. `vendor/` with vendor/.git plus a sibling !vendor/keep rule), the walker wrongly recorded the ignored parent as a repo and then returned filepath.SkipDir from the repo branch — which cut off the very descent the negation relied on. Both outcomes violated the documented last-match-wins semantics: the ignored dir leaked into scan output, and the re-included child never got a chance to register. Return early as soon as an ignored dir is walked-into-for-negation. The .git detection is now reached only for dirs the matcher did not ignore; re-included descendants are examined in their own callback invocation. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 15 +++++++++++---- internal/scan/discovery_test.go | 31 +++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index feae8b2..4211674 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -77,13 +77,20 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { // Don't SkipDir blindly — if any negation rule could // re-include a descendant (e.g. `vendor/` + `!vendor/keep`), // pruning here would drop the re-inclusion before its - // target is visited. Descend and let Match() decide - // per-path. For ordinary `node_modules`-style excludes - // with no negations in play, CouldReinclude returns - // false and pruning remains. 
+ // target is visited. if !matcher.CouldReinclude(rel) { return filepath.SkipDir } + // The dir itself is ignored; we only walk in so a + // negated descendant can be visited in its own turn. + // Return early so the .git detection below doesn't + // record this ignored directory as a repo (and so its + // `return filepath.SkipDir` doesn't cut the descent + // off before the negation's target is seen). A + // descendant whose rel path is re-included by `!rule` + // will be examined by the next WalkDir callback + // invocation. + return nil } // Two repo shapes to detect: diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index e5917ef..27f514f 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -80,6 +80,37 @@ func TestDiscover_HonorsNegatedDescendant(t *testing.T) { } } +// Regression: when the ignored directory itself is a repo and a +// negation rule points at a descendant, the walker must NOT record +// the ignored dir (it's ignored) AND must keep descending so the +// negation's target can be visited. Previously the .git detection +// ran unconditionally and both incorrectly recorded the ignored +// parent AND SkipDir'd the child out of the walk. +func TestDiscover_IgnoredRepoNotRecorded_DescendantStillFound(t *testing.T) { + root := t.TempDir() + // `vendor` is itself a repo AND is ignored by `vendor/`. + mustMkRepo(t, filepath.Join(root, "vendor")) + // `vendor/keep` is a nested repo, re-included by `!vendor/keep`. 
+ mustMkRepo(t, filepath.Join(root, "vendor", "keep")) + + matcher := NewMatcher([]string{"vendor/", "!vendor/keep"}) + repos, err := Discover([]string{root}, matcher, 0) + if err != nil { + t.Fatal(err) + } + + got := map[string]bool{} + for _, r := range repos { + got[r.RelPath] = true + } + if got["vendor"] { + t.Error("vendor itself is ignored and must not be recorded as a repo") + } + if !got["vendor/keep"] { + t.Errorf("vendor/keep should be re-included by the negation rule; got %+v", repos) + } +} + func TestMatcher_CouldReinclude(t *testing.T) { cases := []struct { name string From 26ac75e762855b189492eae83166b2fe78c69c44 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 12:51:00 -0300 Subject: [PATCH 09/37] Account for globbed negations in reinclude checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CouldReinclude only recognized negation rules whose pattern began with a literal `dir/` prefix. Globbed variants that can legitimately match descendants of an ignored directory — `!vendor*/keep`, `!*/keep`, `!*/vendor/keep` — returned false, so the walker pruned the parent and the re-included repos silently disappeared. Per-segment path.Match against the dir's segments handles both literal and glob first segments uniformly. A pattern is a candidate reinclude iff it has strictly more segments than dir and every leading segment matches the corresponding dir segment (path.Match treats `vendor*`, `*`, literals, etc. the same way). Basename-only and `**/`-prefixed negations keep their fire-anywhere semantics. Discovery test uses `vendor*/` + `!vendor*/keep` across vendor/ and vendor-old/ to prove both re-included repos are now kept while vendor/garbage stays out. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery_test.go | 38 ++++++++++++++++++++++++ internal/scan/ignore.go | 51 +++++++++++++++++++++++++++------ 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 27f514f..2a95aa2 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -123,6 +123,13 @@ func TestMatcher_CouldReinclude(t *testing.T) { {"unrelated negation", []string{"vendor/", "!src/main"}, "vendor", false}, {"basename negation could fire anywhere", []string{"vendor/", "!keep"}, "vendor", true}, {"deep-match negation", []string{"build/", "!**/src"}, "build", true}, + // Globbed first segment — reviewer case. + {"glob star in segment", []string{"vendor*/", "!vendor*/keep"}, "vendor", true}, + {"wildcard segment matches any parent", []string{"*/", "!*/keep"}, "vendor", true}, + {"glob prefix that doesn't match dir", []string{"vendor*/", "!foo*/keep"}, "vendor", false}, + {"nested dir with literal pattern", []string{"pkg/vendor/", "!pkg/vendor/keep"}, "pkg/vendor", true}, + {"nested dir with glob in first segment", []string{"*/vendor/", "!*/vendor/keep"}, "pkg/vendor", true}, + {"pattern with same segment count as dir can't match descendant", []string{"!vendor"}, "vendor", true}, // basename-anywhere } for _, c := range cases { t.Run(c.name, func(t *testing.T) { @@ -134,6 +141,37 @@ func TestMatcher_CouldReinclude(t *testing.T) { } } +// Regression: glob-prefixed negations like `!vendor*/keep` or `!*/keep` +// used to slip past CouldReinclude because it only matched literal +// `dir + "/"` prefixes. Discovery then pruned vendor/ and the +// re-included vendor/keep repo disappeared from the scan. 
+func TestDiscover_HonorsGlobbedNegation(t *testing.T) { + root := t.TempDir() + mustMkRepo(t, filepath.Join(root, "vendor", "keep")) + mustMkRepo(t, filepath.Join(root, "vendor", "garbage")) + mustMkRepo(t, filepath.Join(root, "vendor-old", "keep")) + mustMkRepo(t, filepath.Join(root, "unrelated")) + + matcher := NewMatcher([]string{"vendor*/", "!vendor*/keep"}) + repos, err := Discover([]string{root}, matcher, 0) + if err != nil { + t.Fatal(err) + } + got := map[string]bool{} + for _, r := range repos { + got[r.RelPath] = true + } + want := []string{"vendor/keep", "vendor-old/keep", "unrelated"} + for _, w := range want { + if !got[w] { + t.Errorf("missing %q in discovered repos: %+v", w, repos) + } + } + if got["vendor/garbage"] { + t.Error("vendor/garbage should remain ignored") + } +} + func TestDiscover_DoesNotDescendIntoRepo(t *testing.T) { root := t.TempDir() parent := filepath.Join(root, "parent") diff --git a/internal/scan/ignore.go b/internal/scan/ignore.go index 30ab306..3101124 100644 --- a/internal/scan/ignore.go +++ b/internal/scan/ignore.go @@ -116,26 +116,59 @@ func (m *Matcher) Match(relPath string, isDir bool) bool { // the vendor subtree entirely before vendor/keep could be examined, // silently dropping the re-included path. // -// The check is conservative — it returns true when: -// - a negation's pattern begins with `dir/` (explicit descendant -// reference, the documented common case), OR +// Returns true when: // - a negation's pattern has no path separator (basename rules like -// `!keep` that could fire anywhere, including inside dir). +// `!keep` or `!*.go` that could fire at any depth), OR +// - a negation begins with `**/` (deep-match; applies anywhere in the +// tree), OR +// - the pattern's leading path segments are each compatible with the +// corresponding segment of dir via path.Match, so its trailing +// segment(s) could name a descendant. 
This covers literal prefixes +// like `!vendor/keep` AND globbed prefixes like `!vendor*/keep` or +// `!*/keep` — both can match children of an ignored `vendor`. // -// Negations that target a sibling or ancestor path correctly don't -// trigger descent, so pruning remains effective for unrelated ignored -// trees like `node_modules/`. +// Negations that target a sibling or ancestor path (e.g. `!src/keep` +// when walking vendor) correctly don't trigger descent, so pruning +// remains effective for unrelated ignored trees like `node_modules/`. func (m *Matcher) CouldReinclude(dir string) bool { dir = filepath.ToSlash(dir) + dirSegs := strings.Split(dir, "/") for _, r := range m.rules { if !r.Negate { continue } - pat := strings.TrimPrefix(r.Pattern, "**/") + pat := r.Pattern + // `**/foo` and other deep-match patterns can fire at any + // depth — walking into any ignored subtree could reach one. + if strings.HasPrefix(pat, "**/") { + return true + } + // Basename-only rules (no `/`) apply to any segment. `!keep` + // could re-include vendor/keep, a/b/keep, anywhere. if !strings.Contains(pat, "/") { return true } - if strings.HasPrefix(pat, dir+"/") { + patSegs := strings.Split(pat, "/") + // The pattern must have strictly more segments than dir; + // otherwise its deepest named entity is dir itself or an + // ancestor, never a descendant worth descending for. + if len(patSegs) <= len(dirSegs) { + continue + } + // Each of the pattern's leading segments must be compatible + // with the corresponding dir segment. path.Match handles both + // literal names (`vendor`) and globs (`vendor*`, `*`, `?`) + // uniformly; a failing match on any segment rules this + // negation out. 
+ ok := true + for i := 0; i < len(dirSegs); i++ { + matched, err := path.Match(patSegs[i], dirSegs[i]) + if err != nil || !matched { + ok = false + break + } + } + if ok { return true } } From e85768ed22671fc982f2ba6d4cc1f9ea62a0e015 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 13:35:24 -0300 Subject: [PATCH 10/37] Guarantee unique slugs after hashing duplicate basenames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 6-hex (24-bit) hash suffix used for colliding basenames admits a tiny but real birthday-collision probability: two absolute paths whose SHA-1 digests share the first six hex chars would land on the same slug, and scan would silently write both repos' data to the same .jsonl and .state. Corruption of the exact kind the hash was added to prevent. Retry with a longer hash (up to the full 40 hex) whenever the proposed slug set still has duplicates. Happy path remains readable six-char suffixes; the rare collision grows until unique or panics past SHA-1 — the latter would be a cryptographic event, not a scan-time bug. Tests: 10k-duplicate property test (24-bit birthday expects ~3 collisions, forces retry path to fire naturally), determinism across re-runs of the same input, and a targeted test that builds two paths colliding at 6 hex and asserts the retry separates them. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 47 +++++++++++++--- internal/scan/discovery_test.go | 97 +++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 7 deletions(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 4211674..cecbae2 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -141,6 +141,12 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { return repos, nil } +// initialSlugHashLen controls the default hash-suffix length in hex +// chars when two repos share a basename. 
Six chars (24 bits) gives +// readable slugs in the common case; the retry loop in assignSlugs +// grows the length if a truncation collision occurs. +const initialSlugHashLen = 6 + // assignSlugs derives a unique slug per repo from its basename, falling // back to `-` when two repos share a name. // The slug is also the JSONL filename stem and the persistence key @@ -153,6 +159,15 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { // base appears more than once anywhere in the result. This makes the // slug a pure function of (absPath, set of basenames seen), independent // of WalkDir traversal order. +// +// Short-hash truncation (6 hex = 24 bits) admits a small but nonzero +// collision probability — two absolute paths whose SHA-1 digests share +// the same 6-hex prefix would land on the same slug, and scan would +// silently overwrite one repo's JSONL + state file with the other's. +// The retry loop walks the proposed slug set; on any duplicate we +// redo the pass with a longer hash. Grows up to the full 40 hex chars +// before panicking — needing that much is a cryptographic event, not +// a scan-time bug. 
func assignSlugs(repos []Repo) { counts := make(map[string]int) bases := make([]string, len(repos)) @@ -164,15 +179,33 @@ func assignSlugs(repos []Repo) { bases[i] = base counts[base]++ } - for i := range repos { - base := bases[i] - slug := base - if counts[base] > 1 { - h := sha1.Sum([]byte(repos[i].AbsPath)) - slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:6]) + + for hashLen := initialSlugHashLen; hashLen <= 40; hashLen += 2 { + proposed := make([]string, len(repos)) + seen := make(map[string]int, len(repos)) + collided := false + for i := range repos { + base := bases[i] + slug := base + if counts[base] > 1 { + h := sha1.Sum([]byte(repos[i].AbsPath)) + slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:hashLen]) + } + if prev, ok := seen[slug]; ok && prev != i { + collided = true + break + } + seen[slug] = i + proposed[i] = slug + } + if !collided { + for i := range repos { + repos[i].Slug = proposed[i] + } + return } - repos[i].Slug = slug } + panic("scan: slug assignment failed to find unique hash within SHA-1 range") } // isBareRepo returns true when path is a bare git repository — i.e. the diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 2a95aa2..9ec3aa7 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -1,6 +1,9 @@ package scan import ( + "crypto/sha1" + "encoding/hex" + "fmt" "os" "path/filepath" "testing" @@ -86,6 +89,100 @@ func TestDiscover_HonorsNegatedDescendant(t *testing.T) { // negation's target can be visited. Previously the .git detection // ran unconditionally and both incorrectly recorded the ignored // parent AND SkipDir'd the child out of the walk. +// With a 6-hex (24-bit) suffix, the birthday paradox predicts a +// truncation collision around ~2^12 duplicates. Generating 10k paths +// that share a basename makes collisions statistically near-certain +// (expected count ≈ 3). 
The invariant "all resulting slugs are +// distinct" forces assignSlugs's retry loop to grow the hash and +// still produce a unique namespace — the exact corruption class the +// extra iterations were added to prevent. +func TestAssignSlugs_UniqueEvenUnderTruncationCollisions(t *testing.T) { + const n = 10000 + repos := make([]Repo, n) + for i := 0; i < n; i++ { + repos[i] = Repo{AbsPath: fmt.Sprintf("/path/to/dir-%06d/myrepo", i)} + } + assignSlugs(repos) + + slugs := make(map[string]int, n) + for i, r := range repos { + if prev, ok := slugs[r.Slug]; ok { + t.Fatalf("duplicate slug %q between repos[%d]=%s and repos[%d]=%s — JSONL + state files would collide", + r.Slug, prev, repos[prev].AbsPath, i, r.AbsPath) + } + slugs[r.Slug] = i + } +} + +// Directly exercise the retry path with two paths constructed to +// collide at initialSlugHashLen. Without the retry, both would get +// slug "myrepo-" and the scan would silently overwrite one +// repo's files with the other's. With the retry, the loop grows +// the hash until the pair separates. +func TestAssignSlugs_ResolvesFirstPrefixCollision(t *testing.T) { + a, b, found := findColliding6HexPaths(50000) + if !found { + t.Skip("no colliding pair found within search budget — astronomically unlikely, skip rather than flake") + } + repos := []Repo{{AbsPath: a}, {AbsPath: b}} + assignSlugs(repos) + + if repos[0].Slug == repos[1].Slug { + t.Fatalf("retry failed: both repos got slug %q for colliding paths %s and %s", repos[0].Slug, a, b) + } + // Sanity: the slug suffix must be longer than the initial 6 hex, + // proving the retry branch actually fired. 
+ const minLenAfterRetry = len("myrepo-") + initialSlugHashLen + 1 + if len(repos[0].Slug) < minLenAfterRetry && len(repos[1].Slug) < minLenAfterRetry { + t.Errorf("expected at least one slug to have a longer hash after retry; got %q and %q", repos[0].Slug, repos[1].Slug) + } +} + +// findColliding6HexPaths searches a deterministic sequence of +// `myrepo` paths for any two whose SHA-1 digests share the first +// initialSlugHashLen hex chars. At 24 bits of resolution the +// birthday bound hits 50% around N≈4900, so maxAttempts=50000 is +// overwhelmingly likely to yield a pair. Returns false only if no +// collision was found — the test treats that as a skip, not a +// failure, since the worst-case outcome is a missed opportunity to +// exercise the retry, not a bug. +func findColliding6HexPaths(maxAttempts int) (string, string, bool) { + seen := make(map[string]string, maxAttempts) + for i := 0; i < maxAttempts; i++ { + p := fmt.Sprintf("/search/path-%07d/myrepo", i) + h := sha1.Sum([]byte(p)) + prefix := hex.EncodeToString(h[:])[:initialSlugHashLen] + if prev, ok := seen[prefix]; ok { + return prev, p, true + } + seen[prefix] = p + } + return "", "", false +} + +// Same paths ingested twice must produce the same slugs — this is +// what makes resume work across runs. Even with the retry loop +// adjusting hash length under collisions, a deterministic input set +// must yield a deterministic output. 
+func TestAssignSlugs_DeterministicUnderRetry(t *testing.T) { + build := func() []Repo { + rs := make([]Repo, 5000) + for i := 0; i < 5000; i++ { + rs[i] = Repo{AbsPath: fmt.Sprintf("/work/p-%05d/myrepo", i)} + } + return rs + } + a := build() + b := build() + assignSlugs(a) + assignSlugs(b) + for i := range a { + if a[i].Slug != b[i].Slug { + t.Errorf("slug for %s differs across runs: %q vs %q", a[i].AbsPath, a[i].Slug, b[i].Slug) + } + } +} + func TestDiscover_IgnoredRepoNotRecorded_DescendantStillFound(t *testing.T) { root := t.TempDir() // `vendor` is itself a repo AND is ignored by `vendor/`. From c4958bb56d7622b10bb6381d0044ea715a78aeb1 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 13:39:05 -0300 Subject: [PATCH 11/37] Propagate context cancellation into Discover walk Discover took no context, so a ctx-cancelled scan still walked the entire tree before returning. On home-directory scans where the walk phase can be the longest leg of the whole command, Ctrl+C felt unresponsive and anyone running interactive scans saw seconds of post-cancel filesystem churn. Thread ctx into Discover; each filepath.WalkDir callback checks ctx.Err() before doing filesystem work and returns it to short-circuit the walk. A pre-cancelled context test asserts Discover returns context.Canceled with no repos instead of draining the tree. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/scan/discovery.go | 23 +++++++++++++- internal/scan/discovery_test.go | 55 ++++++++++++++++++++++++++------- internal/scan/scan.go | 2 +- 3 files changed, 66 insertions(+), 14 deletions(-) diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index cecbae2..2f2716a 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -1,6 +1,7 @@ package scan import ( + "context" "crypto/sha1" "encoding/hex" "fmt" @@ -29,14 +30,27 @@ type Repo struct { // unique across the full result so downstream JSONL files don't collide, // and the prefix LoadMultiJSONL derives (basename-minus-ext) still groups // correctly. -func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { +// +// A cancelled ctx aborts the walk promptly — important because on large +// roots (home directory scans, monorepos of repos) discovery is often +// the longest phase of scan and Ctrl+C needs to land in real time, not +// "after the walk naturally finishes". Each filepath.WalkDir callback +// checks ctx.Done() before doing any filesystem work, so the abort +// window is bounded by the cost of one stat call per dir entry. +func Discover(ctx context.Context, roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { if matcher == nil { matcher = NewMatcher(nil) } + if ctx == nil { + ctx = context.Background() + } var repos []Repo seen := make(map[string]bool) for _, root := range roots { + if err := ctx.Err(); err != nil { + return nil, err + } abs, err := filepath.Abs(root) if err != nil { return nil, fmt.Errorf("resolve %s: %w", root, err) @@ -50,6 +64,13 @@ func Discover(roots []string, matcher *Matcher, maxDepth int) ([]Repo, error) { } err = filepath.WalkDir(abs, func(path string, d os.DirEntry, werr error) error { + // Return ctx.Err() (not SkipDir) so WalkDir short-circuits + // the entire walk. 
Without this check, a cancelled scan + // would keep stat'ing every directory in a large tree + // before the caller saw the interrupt. + if err := ctx.Err(); err != nil { + return err + } if werr != nil { // Permission errors on one subtree shouldn't abort the // whole scan. Log and keep walking. diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 9ec3aa7..8639acb 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -1,6 +1,7 @@ package scan import ( + "context" "crypto/sha1" "encoding/hex" "fmt" @@ -20,7 +21,7 @@ func TestDiscover_FindsRepos(t *testing.T) { t.Fatal(err) } - repos, err := Discover([]string{root}, NewMatcher(nil), 0) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatalf("Discover: %v", err) } @@ -44,7 +45,7 @@ func TestDiscover_RespectsIgnore(t *testing.T) { mustMkRepo(t, filepath.Join(root, "node_modules", "garbage")) matcher := NewMatcher([]string{"node_modules"}) - repos, err := Discover([]string{root}, matcher, 0) + repos, err := Discover(context.Background(), []string{root}, matcher, 0) if err != nil { t.Fatal(err) } @@ -64,7 +65,7 @@ func TestDiscover_HonorsNegatedDescendant(t *testing.T) { mustMkRepo(t, filepath.Join(root, "vendor", "keep")) matcher := NewMatcher([]string{"vendor/", "!vendor/keep"}) - repos, err := Discover([]string{root}, matcher, 0) + repos, err := Discover(context.Background(), []string{root}, matcher, 0) if err != nil { t.Fatal(err) } @@ -183,6 +184,36 @@ func TestAssignSlugs_DeterministicUnderRetry(t *testing.T) { } } +// A cancelled context must abort the walk — previously Discover +// ignored ctx entirely, so Ctrl+C during the walk phase only took +// effect after every directory had been stat'd. Test uses a +// pre-cancelled context to trip the early-return on the very first +// callback; no repos should be returned and the error should equal +// ctx.Err(). 
+func TestDiscover_AbortsOnCancelledContext(t *testing.T) { + root := t.TempDir() + // Enough decoy directories that an un-aborted walk would still + // produce observable work — the assertion below fails loudly if + // the check is skipped. + for i := 0; i < 20; i++ { + if err := os.MkdirAll(filepath.Join(root, fmt.Sprintf("dir-%d", i)), 0o755); err != nil { + t.Fatal(err) + } + } + mustMkRepo(t, filepath.Join(root, "would-be-found")) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // pre-cancelled + + repos, err := Discover(ctx, []string{root}, NewMatcher(nil), 0) + if err != context.Canceled { + t.Fatalf("want context.Canceled, got err=%v", err) + } + if len(repos) != 0 { + t.Errorf("want empty repo list after cancel, got %+v", repos) + } +} + func TestDiscover_IgnoredRepoNotRecorded_DescendantStillFound(t *testing.T) { root := t.TempDir() // `vendor` is itself a repo AND is ignored by `vendor/`. @@ -191,7 +222,7 @@ func TestDiscover_IgnoredRepoNotRecorded_DescendantStillFound(t *testing.T) { mustMkRepo(t, filepath.Join(root, "vendor", "keep")) matcher := NewMatcher([]string{"vendor/", "!vendor/keep"}) - repos, err := Discover([]string{root}, matcher, 0) + repos, err := Discover(context.Background(), []string{root}, matcher, 0) if err != nil { t.Fatal(err) } @@ -250,7 +281,7 @@ func TestDiscover_HonorsGlobbedNegation(t *testing.T) { mustMkRepo(t, filepath.Join(root, "unrelated")) matcher := NewMatcher([]string{"vendor*/", "!vendor*/keep"}) - repos, err := Discover([]string{root}, matcher, 0) + repos, err := Discover(context.Background(), []string{root}, matcher, 0) if err != nil { t.Fatal(err) } @@ -277,7 +308,7 @@ func TestDiscover_DoesNotDescendIntoRepo(t *testing.T) { // submodule and skipped — we don't double-count. 
mustMkRepo(t, filepath.Join(parent, "submodule")) - repos, err := Discover([]string{root}, NewMatcher(nil), 0) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatal(err) } @@ -291,7 +322,7 @@ func TestDiscover_SlugCollisionGetsHashSuffix(t *testing.T) { mustMkRepo(t, filepath.Join(root, "a", "myrepo")) mustMkRepo(t, filepath.Join(root, "b", "myrepo")) - repos, err := Discover([]string{root}, NewMatcher(nil), 0) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatal(err) } @@ -320,11 +351,11 @@ func TestDiscover_SlugDeterministicAcrossRuns(t *testing.T) { mustMkRepo(t, filepath.Join(root, "a", "myrepo")) mustMkRepo(t, filepath.Join(root, "b", "myrepo")) - first, err := Discover([]string{root}, NewMatcher(nil), 0) + first, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatal(err) } - second, err := Discover([]string{root}, NewMatcher(nil), 0) + second, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatal(err) } @@ -354,7 +385,7 @@ func TestDiscover_RejectsSymlinkGit(t *testing.T) { } mustMkRepo(t, filepath.Join(root, "real")) - repos, err := Discover([]string{root}, NewMatcher(nil), 0) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil { t.Fatal(err) } @@ -368,7 +399,7 @@ func TestDiscover_MaxDepthHonored(t *testing.T) { mustMkRepo(t, filepath.Join(root, "shallow")) mustMkRepo(t, filepath.Join(root, "a", "b", "c", "deep")) - repos, err := Discover([]string{root}, NewMatcher(nil), 2) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 2) if err != nil { t.Fatal(err) } @@ -411,7 +442,7 @@ func TestDiscover_FindsBareRepo(t *testing.T) { t.Fatal(err) } - repos, err := Discover([]string{root}, NewMatcher(nil), 0) + repos, err := Discover(context.Background(), []string{root}, NewMatcher(nil), 0) if err != nil 
{ t.Fatal(err) } diff --git a/internal/scan/scan.go b/internal/scan/scan.go index 0d4a41d..4417ad6 100644 --- a/internal/scan/scan.go +++ b/internal/scan/scan.go @@ -80,7 +80,7 @@ func Run(ctx context.Context, cfg Config) (*Result, error) { return nil, err } - repos, err := Discover(cfg.Roots, matcher, cfg.MaxDepth) + repos, err := Discover(ctx, cfg.Roots, matcher, cfg.MaxDepth) if err != nil { return nil, err } From 6153c59f9e651c5b842eb79a9547065c8600f298 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 13:44:30 -0300 Subject: [PATCH 12/37] Harden tests for scan feature gaps Three additions that fill coverage holes called out in the self- assessment of this branch: 1. TestGenerate_PerRepoBreakdownWidthsAreNumeric now asserts at least one bar width renders above zero. The absence-of-error- markers check would previously pass a regression where every width silently collapsed to 0% (bars invisible but template still valid). 2. TestDiscover_AbortsMidWalk measures a baseline walk time, cancels at 25% of that interval, and asserts the cancelled walk returned ctx.Canceled, cut short before baseline, and produced fewer than the full repo count. Complements the existing pre- cancelled test, which only proved the first callback checks ctx. 3. TestGenerateProfile_MultiRepoBreakdownFiltersByEmail covers the scan --email --report flow end-to-end: two repos, one with a colleague-exclusive commit + file, assert the breakdown section renders, commit counts per repo reflect only the filtered dev, and colleague-only files do not bleed into the report. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/report_test.go | 93 +++++++++++++++++++++++++++++++++ internal/scan/discovery_test.go | 69 ++++++++++++++++++++++++ 2 files changed, 162 insertions(+) diff --git a/internal/report/report_test.go b/internal/report/report_test.go index 595b0f6..f27cf33 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "path/filepath" + "regexp" "strings" "testing" @@ -119,6 +120,98 @@ func TestGenerate_PerRepoBreakdownWidthsAreNumeric(t *testing.T) { if strings.Contains(out, "%!f(string=") { t.Error("report contains %!f(string=...) — printf was given a string where a float was expected") } + + // Stronger guard: the absence-of-markers check above would pass + // if a future bug rendered every bar at `width:0%`. The breakdown + // tile has a green fill (background:#216e39) — extract every + // rendered width for those tiles and assert at least one is + // non-zero. With alpha's 10-churn commit and beta's 25-churn + // commit producing distinct totals, the commit-share proportions + // can't all collapse to zero unless the template is broken. + widthRe := regexp.MustCompile(`width:([0-9]+(?:\.[0-9]+)?)%; background:#216e39`) + matches := widthRe.FindAllStringSubmatch(out, -1) + if len(matches) == 0 { + t.Fatal("no repo-breakdown bar widths found; template may have changed selector") + } + sawNonZero := false + for _, m := range matches { + if m[1] != "0" && m[1] != "0.0" { + sawNonZero = true + break + } + } + if !sawNonZero { + t.Errorf("every repo bar rendered with width:0 — proportions would be invisible in the UI; widths: %v", matches) + } +} + +// End-to-end for the `gitcortex scan --email me@x.com --report …` +// flow. Covers three assertions at once: +// 1. GenerateProfile emits the Per-Repository Breakdown section +// when the dataset is multi-repo (gated on len(Repos) > 1). +// 2. 
Counts per repo are filtered to the dev — a commit by +// someone-else@x.com in alpha doesn't bleed into my profile's +// alpha row. +// 3. Files counted per repo are only files THIS dev touched — a +// colleague-exclusive file in alpha must not inflate my scope. +func TestGenerateProfile_MultiRepoBreakdownFiltersByEmail(t *testing.T) { + dir := t.TempDir() + alpha := filepath.Join(dir, "alpha.jsonl") + beta := filepath.Join(dir, "beta.jsonl") + + // alpha: me has 1 commit on a.go; colleague has 1 commit on + // colleague-only.go. beta: me has 1 commit on b.go. + alphaContent := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa01","author_email":"me@x.com","author_name":"Me","author_date":"2024-01-10T00:00:00Z","additions":10,"deletions":0,"files_changed":1} +{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa01","path_current":"a.go","additions":10,"deletions":0} +{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa02","author_email":"colleague@x.com","author_name":"Col","author_date":"2024-01-11T00:00:00Z","additions":5,"deletions":0,"files_changed":1} +{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa02","path_current":"colleague-only.go","additions":5,"deletions":0} +` + betaContent := `{"type":"commit","sha":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb01","author_email":"me@x.com","author_name":"Me","author_date":"2024-02-10T00:00:00Z","additions":20,"deletions":5,"files_changed":1} +{"type":"commit_file","commit":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb01","path_current":"b.go","additions":20,"deletions":5} +` + if err := os.WriteFile(alpha, []byte(alphaContent), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(beta, []byte(betaContent), 0o644); err != nil { + t.Fatal(err) + } + + ds, err := stats.LoadMultiJSONL([]string{alpha, beta}) + if err != nil { + t.Fatalf("LoadMultiJSONL: %v", err) + } + + var buf bytes.Buffer + if err := GenerateProfile(&buf, ds, "scan-profile", "me@x.com"); 
err != nil { + t.Fatalf("GenerateProfile: %v", err) + } + out := buf.String() + + if !strings.Contains(out, "Per-Repository Breakdown") { + t.Fatal("profile report missing the breakdown section — scan --email users won't see their cross-repo split") + } + + // Each row renders as `
alphaN(alpha|beta)(\d+)N
- - - - - - - - - - - -{{range .Repos}} - - - - - - - - - - - -{{end}} -
<table>
  <tr>
    <th>Repository</th><th>Commits</th><th>% Commits</th><th>Churn</th><th>% Churn</th><th>Files</th><th>Active days</th><th>Devs</th><th>First → Last</th>
  </tr>
{{range .Repos}}
  <tr>
    <td>{{.Repo}}</td>
    <td>{{thousands .Commits}}</td>
    <td>
      <div style="background:#eee; border-radius:3px; overflow:hidden">
        <div style="width:{{printf "%.1f" .PctOfTotalCommits}}%; background:#216e39; height:8px"></div>
      </div>
      {{printf "%.1f" .PctOfTotalCommits}}%
    </td>
    <td>{{thousands .Churn}}</td>
    <td>{{printf "%.1f" .PctOfTotalChurn}}%</td>
    <td>{{thousands .Files}}</td>
    <td>{{.ActiveDays}}</td>
    <td>{{.UniqueDevs}}</td>
    <td>{{.FirstCommitDate}} → {{.LastCommitDate}}</td>
  </tr>
{{end}}
</table>
{{if .ActivityYears}}

Activity

Monthly commit heatmap. Darker = more commits. Sudden drop-offs may mark team changes, re-orgs, or freezes; steady cadence signals healthy pace. Hover for details; toggle to table for exact numbers. · {{docRef "activity"}}

From a0a6251af16c5da751344e997fa6f64be363c541 Mon Sep 17 00:00:00 2001
From: lex0c
Date: Mon, 20 Apr 2026 18:07:25 -0300
Subject: [PATCH 21/37] Add --report-dir for per-repo HTML reports plus an
 index landing page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The old `scan --report <path>` rendered one consolidated team report
across every scanned repo, which mixed hotspots, bus factor, and
coupling signal from unrelated codebases. A small `main.go` in a
100-commit repo could never surface beside a 50k-commit `wp-includes/`
— the ranking was dominated by history size, not architectural
significance.

Replace the team-consolidation flow with --report-dir: for each
successful repo in the manifest, load its JSONL as a standalone
Dataset and render <dir>/<slug>.html via the existing report pipeline.
Each HTML is equivalent to running `gitcortex report` against that one
repo; no cross-repo mixing. Afterward emit <dir>/index.html with
per-repo summary cards (commits, churn, devs, first→last date, status
pill, relative-volume bar) linking into every successful report.
Failed and pending entries surface inline so operators can spot them
without opening the manifest.

The only report flag that still takes a single file path is --report
combined with --email, which renders the consolidated profile
(genuinely cross-repo signal: "where did this person spend their
time"). --report alone is now a CLI error pointing at both
alternatives.

Tests cover both the validation branch and the end-to-end --report-dir
flow: two real git repos produce two standalone HTMLs (asserted to
carry no `<slug>:` path prefix, proving LoadJSONL and not
LoadMultiJSONL was used) and an index with hrefs into each.
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 20 ++-- cmd/gitcortex/main.go | 190 ++++++++++++++++++++++++++++------ cmd/gitcortex/main_test.go | 115 ++++++++++++++++++++ internal/report/scan_index.go | 157 ++++++++++++++++++++++++++++ 4 files changed, 445 insertions(+), 37 deletions(-) create mode 100644 internal/report/scan_index.go diff --git a/README.md b/README.md index df25f09..4d1cede 100644 --- a/README.md +++ b/README.md @@ -405,16 +405,22 @@ For workspaces containing many repos (an engineer's `~/work`, a platform team's ### Scan: discover and aggregate every repo under a root -Walk one or more directories, find every git repository (working trees and bare clones both detected), extract them in parallel, and optionally generate a consolidated HTML report. The main use case is "show my manager every project I've touched in the last year" or "give a platform team one dashboard across N services" without scripting the multi-extract + multi-input pipeline manually. +Walk one or more directories, find every git repository (working trees and bare clones both detected), extract them in parallel, and optionally render HTML. Two output modes: + +- `--report-dir ` — one standalone HTML per repo plus an `index.html` landing page linking them. Each per-repo report is equivalent to running `gitcortex report` against that repo alone; no metric mixing across unrelated codebases. +- `--report --email
` — a **single** consolidated profile report for one developer across every scanned repo. The only cross-repo aggregation in the feature, because "where did this person spend their time?" is the only question that genuinely benefits from pooling signal across projects. + +There is no third mode. Cross-repo consolidation at the team/codebase level inflates hotspots, bus factor, and coupling with noise from unrelated codebases; if that's what you want, inspect `manifest.json` or run `gitcortex report` per JSONL. ```bash -# Discover and extract every repo under ~/work, one JSONL per repo +# Discover and extract every repo under ~/work (JSONLs + manifest, no HTML) gitcortex scan --root ~/work --output ./scan-out -# Consolidated report across all discovered repos -gitcortex scan --root ~/work --output ./scan-out --report ./all.html +# Per-repo HTML reports + index landing page +gitcortex scan --root ~/work --output ./scan-out --report-dir ./reports +# opens ./reports/index.html → click through to each repo -# Personal cross-repo profile: only MY commits, with per-repo breakdown +# Personal cross-repo profile: only MY commits, consolidated into one HTML gitcortex scan --root ~/work --output ./scan-out \ --report ./me.html --email me@company.com --since 1y \ --include-commit-messages @@ -422,7 +428,7 @@ gitcortex scan --root ~/work --output ./scan-out \ # Multiple roots, higher parallelism, pre-set ignore patterns gitcortex scan --root ~/work --root ~/personal --root ~/oss \ --parallel 8 --max-depth 4 \ - --output ./scan-out --report ./all.html + --output ./scan-out --report-dir ./reports ``` The scan output directory holds: @@ -450,7 +456,7 @@ vendor/ Directory rules, globs, `**/foo`, and `!path` negations all work. Globbed negations like `!vendor*/keep` are honored — discovery descends into any dir where a negation rule could match a descendant. If `--ignore-file` is not set, scan looks for `.gitcortex-ignore` in the first `--root`. 
-**Consolidated profile report.** When `scan --email me@company.com --report path.html` runs against a multi-repo dataset, the profile report renders a *Per-Repository Breakdown* section: commits, churn, files, active days, and share-of-total — all filtered to that developer's contributions (files count reflects only files the dev touched). The section is profile-only; the team report focuses on the aggregate across repos without a per-repo split (run `stats --input repo.jsonl` per-repo or read the `manifest.json` for that). +**Consolidated profile report.** When `scan --email me@company.com --report path.html` runs against a multi-repo dataset, the profile report renders a *Per-Repository Breakdown* section: commits, churn, files, active days, and share-of-total — all filtered to that developer's contributions (files count reflects only files the dev touched). This is the one report that legitimately aggregates across repos; team-level views live in `--report-dir` (one HTML per repo, never mixed). **Flags worth knowing:** diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 948aac4..90aaaf5 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -872,6 +872,114 @@ func reportCmd() *cobra.Command { return cmd } +// renderScanReportDir is `scan --report-dir` in one place: for each +// successful repo in the manifest, load only that repo's JSONL as a +// standalone dataset, run the normal `report` pipeline into +// /.html, and afterward emit /index.html linking +// all of them. Intentionally uses LoadJSONL (single-input) rather +// than LoadMultiJSONL so each per-repo report's paths carry no +// `:` prefix — each tab of the experience is self-contained. 
+func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOptions, sf stats.StatsFlags, topN int) error { + if err := os.MkdirAll(dir, 0o755); err != nil { + return fmt.Errorf("create report dir: %w", err) + } + + // Build an index-row slice parallel to Manifest.Repos so the + // landing page preserves scan order. We can't skip failed/pending + // entries — operators need to see what didn't produce a report. + entries := make([]reportpkg.ScanIndexEntry, 0, len(result.Manifest.Repos)) + var totalCommits int + allDevs := map[string]struct{}{} + maxCommits := 0 + + for _, m := range result.Manifest.Repos { + entry := reportpkg.ScanIndexEntry{ + Slug: m.Slug, + Path: m.Path, + Status: m.Status, + Error: m.Error, + } + if m.Status != "ok" { + entries = append(entries, entry) + continue + } + + ds, err := stats.LoadJSONL(m.JSONL, loadOpts) + if err != nil { + // Treat load failure as a per-repo failure — index shows it, + // but the scan itself was already successful to that point. 
+ entry.Status = "failed" + entry.Error = fmt.Sprintf("load %s: %v", m.JSONL, err) + entries = append(entries, entry) + continue + } + + outFile := filepath.Join(dir, m.Slug+".html") + f, err := os.Create(outFile) + if err != nil { + return fmt.Errorf("create %s: %w", outFile, err) + } + if err := reportpkg.Generate(f, ds, m.Slug, topN, sf); err != nil { + f.Close() + return fmt.Errorf("generate report for %s: %w", m.Slug, err) + } + f.Close() + + summary := stats.ComputeSummary(ds) + entry.Commits = summary.TotalCommits + entry.Devs = summary.TotalDevs + entry.Files = summary.TotalFiles + entry.Churn = summary.TotalAdditions + summary.TotalDeletions + entry.FirstCommitDate = summary.FirstCommitDate + entry.LastCommitDate = summary.LastCommitDate + entry.ReportHref = m.Slug + ".html" + + totalCommits += entry.Commits + if entry.Commits > maxCommits { + maxCommits = entry.Commits + } + for _, c := range stats.TopContributors(ds, 0) { + allDevs[c.Email] = struct{}{} + } + + entries = append(entries, entry) + } + + indexPath := filepath.Join(dir, "index.html") + f, err := os.Create(indexPath) + if err != nil { + return fmt.Errorf("create index: %w", err) + } + defer f.Close() + + var okCount, failedCount int + for _, e := range entries { + switch e.Status { + case "ok": + okCount++ + case "failed", "pending": + failedCount++ + } + } + + data := reportpkg.ScanIndexData{ + Roots: result.Manifest.Roots, + Repos: entries, + TotalRepos: len(entries), + OKRepos: okCount, + FailedRepos: failedCount, + TotalCommits: totalCommits, + TotalDevs: len(allDevs), + MaxCommits: maxCommits, + } + if err := reportpkg.GenerateScanIndex(f, data); err != nil { + return fmt.Errorf("generate index: %w", err) + } + fmt.Fprintf(os.Stderr, "Per-repo reports written to %s (%d ok, %d failed); open %s\n", + dir, okCount, failedCount, fileURL(indexPath)) + return nil +} + // --- Scan --- func scanCmd() *cobra.Command { @@ -886,6 +994,7 @@ func scanCmd() *cobra.Command { to string since string 
reportPath string + reportDir string topN int extractIgnore []string batchSize int @@ -921,6 +1030,21 @@ breakdown — handy for showing aggregated work across many repos.`, if from != "" && to != "" && from > to { return fmt.Errorf("--from (%s) must be on or before --to (%s)", from, to) } + // Report-flag combinations. --report is reserved for the + // only case that genuinely consolidates signal across repos + // — the per-developer profile; cross-repo team aggregation + // inflates hotspots/bus-factor/coupling with mixed-codebase + // noise and has been replaced by --report-dir which emits + // one standalone HTML per repo plus an index landing page. + if reportPath != "" && email == "" { + return fmt.Errorf("--report requires --email (profile consolidation); for per-repo HTML reports use --report-dir ") + } + if reportPath != "" && reportDir != "" { + return fmt.Errorf("--report and --report-dir are mutually exclusive; pick profile (--report + --email) or per-repo (--report-dir)") + } + if email != "" && reportDir != "" { + return fmt.Errorf("--email filters a single consolidated profile report; combine it with --report , not --report-dir") + } // Resolve --since up-front. 
If this fails we'd otherwise // discover the typo only after a full multi-repo scan — // minutes to hours on a large workspace, all thrown away @@ -977,48 +1101,53 @@ breakdown — handy for showing aggregated work across many repos.`, filepath.Join(result.OutputDir, "manifest.json")) } - if reportPath == "" { - fmt.Fprintf(os.Stderr, "Scan complete: %d JSONL file(s) in %s\n", len(result.JSONLs), result.OutputDir) - return nil - } - - ds, err := stats.LoadMultiJSONL(result.JSONLs, stats.LoadOptions{ + sf := stats.StatsFlags{CouplingMinChanges: couplingMinChanges, NetworkMinFiles: networkMinFiles} + loadOpts := stats.LoadOptions{ From: fromDate, To: to, HalfLifeDays: churnHalfLife, CoupMaxFiles: couplingMaxFiles, - }) - if err != nil { - return fmt.Errorf("load consolidated dataset: %w", err) } - fmt.Fprintf(os.Stderr, "Loaded %d commits across %d repo(s)\n", ds.CommitCount, len(result.JSONLs)) - f, err := os.Create(reportPath) - if err != nil { - return fmt.Errorf("create report: %w", err) - } - defer f.Close() - - // Label the report after the basename of the first --root - // (or the output dir as a fallback). "scan-scan-output" was - // the previous default; users find the root name far more - // recognizable as the H1 of the report. - repoLabel := filepath.Base(result.OutputDir) - if len(cfg.Roots) > 0 { - repoLabel = filepath.Base(absPath(cfg.Roots[0])) - } - sf := stats.StatsFlags{CouplingMinChanges: couplingMinChanges, NetworkMinFiles: networkMinFiles} + // Path 1: profile consolidation — the one cross-repo + // aggregation that earns its place (a single dev's work + // across all scanned repos). Build a multi-repo Dataset + // (path-prefixed via LoadMultiJSONL) and filter by email + // at render time. 
if email != "" { + ds, err := stats.LoadMultiJSONL(result.JSONLs, loadOpts) + if err != nil { + return fmt.Errorf("load consolidated dataset: %w", err) + } + fmt.Fprintf(os.Stderr, "Loaded %d commits across %d repo(s)\n", ds.CommitCount, len(result.JSONLs)) + + f, err := os.Create(reportPath) + if err != nil { + return fmt.Errorf("create report: %w", err) + } + defer f.Close() + + repoLabel := filepath.Base(absPath(cfg.Roots[0])) if err := reportpkg.GenerateProfile(f, ds, repoLabel, email); err != nil { return fmt.Errorf("generate profile report: %w", err) } fmt.Fprintf(os.Stderr, "Profile report for %s written to %s\n", email, fileURL(reportPath)) return nil } - if err := reportpkg.Generate(f, ds, repoLabel, topN, sf); err != nil { - return fmt.Errorf("generate report: %w", err) + + // Path 2: per-repo reports + index. Each repo is loaded + // standalone so its stats reflect its own codebase — no + // hotspot/bus-factor/coupling mixing across unrelated + // projects. The index.html lists every repo with cards + // linking into each per-repo report; failed extracts are + // surfaced inline so operators can spot them without + // opening the manifest. + if reportDir != "" { + return renderScanReportDir(result, reportDir, loadOpts, sf, topN) } - fmt.Fprintf(os.Stderr, "Consolidated report written to %s\n", fileURL(reportPath)) + + // Path 3: no report flag — just confirm JSONLs landed. + fmt.Fprintf(os.Stderr, "Scan complete: %d JSONL file(s) in %s\n", len(result.JSONLs), result.OutputDir) return nil }, } @@ -1032,8 +1161,9 @@ breakdown — handy for showing aggregated work across many repos.`, cmd.Flags().StringVar(&from, "from", "", "Window start date YYYY-MM-DD (forwarded to the consolidated report)") cmd.Flags().StringVar(&to, "to", "", "Window end date YYYY-MM-DD (forwarded to the consolidated report)") cmd.Flags().StringVar(&since, "since", "", "Filter to recent period (e.g. 
7d, 4w, 3m, 1y); mutually exclusive with --from/--to") - cmd.Flags().StringVar(&reportPath, "report", "", "If set, generate a consolidated HTML report at this path after the scan") - cmd.Flags().IntVar(&topN, "top", 20, "Top-N entries per section in the consolidated report") + cmd.Flags().StringVar(&reportPath, "report", "", "With --email, write a developer profile consolidated across all scanned repos to this HTML file. For per-repo reports use --report-dir instead.") + cmd.Flags().StringVar(&reportDir, "report-dir", "", "Directory to write one standalone HTML report per repo plus an index.html landing page. Each per-repo report is equivalent to running `gitcortex report` on that repo alone — no cross-repo metric mixing.") + cmd.Flags().IntVar(&topN, "top", 20, "Top-N entries per section in each per-repo report") cmd.Flags().StringSliceVar(&extractIgnore, "extract-ignore", nil, "Glob patterns forwarded to per-repo extract --ignore (e.g. package-lock.json)") cmd.Flags().IntVar(&batchSize, "batch-size", 1000, "Per-repo extract checkpoint interval") cmd.Flags().BoolVar(&mailmap, "mailmap", false, "Use .mailmap (per repo) to normalize identities") diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index aca5896..884db8f 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -3,6 +3,7 @@ package main import ( "bytes" "os" + "os/exec" "path/filepath" "strings" "testing" @@ -116,6 +117,120 @@ func TestScanCmd_RejectsMissingIgnoreFile(t *testing.T) { } } +// --report without --email is ambiguous under the "no team-wide +// consolidation" design: the only meaningful single-HTML output +// across multiple repos is a developer profile. The error must +// point the user at the right flag for each intent. 
+func TestScanCmd_RejectsReportWithoutEmail(t *testing.T) { + cmd := scanCmd() + cmd.SilenceUsage = true + cmd.SilenceErrors = true + cmd.SetOut(&bytes.Buffer{}) + cmd.SetErr(&bytes.Buffer{}) + cmd.SetArgs([]string{ + "--root", t.TempDir(), + "--output", t.TempDir(), + "--report", filepath.Join(t.TempDir(), "out.html"), + }) + err := cmd.Execute() + if err == nil { + t.Fatal("expected error for --report without --email") + } + if !strings.Contains(err.Error(), "--report-dir") || !strings.Contains(err.Error(), "--email") { + t.Errorf("error should point at both --report-dir and --email as remediation; got %q", err) + } +} + +// `scan --report-dir ` writes index.html plus one HTML per +// successful repo, with each per-repo HTML rendered from a +// single-input Dataset (no cross-repo path prefix). End-to-end +// check against two real repos built with git. +func TestScanCmd_ReportDirGeneratesPerRepoHTMLAndIndex(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not installed") + } + root := t.TempDir() + makeRepoWithCommit(t, filepath.Join(root, "alpha"), "a.go", "package a\n") + makeRepoWithCommit(t, filepath.Join(root, "beta"), "b.py", "print('b')\n") + + scanOut := t.TempDir() + reportDir := t.TempDir() + + cmd := scanCmd() + cmd.SilenceUsage = true + cmd.SilenceErrors = true + cmd.SetOut(&bytes.Buffer{}) + cmd.SetErr(&bytes.Buffer{}) + cmd.SetArgs([]string{ + "--root", root, + "--output", scanOut, + "--report-dir", reportDir, + }) + if err := cmd.Execute(); err != nil { + t.Fatalf("scan --report-dir: %v", err) + } + + // Every successful repo gets a standalone HTML named .html. 
+ for _, slug := range []string{"alpha", "beta"} { + path := filepath.Join(reportDir, slug+".html") + b, err := os.ReadFile(path) + if err != nil { + t.Fatalf("per-repo report missing: %v", err) + } + if !strings.Contains(string(b), "") { + t.Errorf("%s should be a full HTML document", path) + } + // Paths inside the per-repo report must NOT carry a + // `:` prefix — each tab is standalone, so LoadJSONL + // (not LoadMultiJSONL) was used and file paths appear raw. + if strings.Contains(string(b), slug+":") { + t.Errorf("per-repo report %s contains a `%s:` path prefix; Dataset was loaded as multi-repo", path, slug) + } + } + + // index.html links to each per-repo report. + indexBytes, err := os.ReadFile(filepath.Join(reportDir, "index.html")) + if err != nil { + t.Fatalf("index.html missing: %v", err) + } + index := string(indexBytes) + if !strings.Contains(index, `href="alpha.html"`) || !strings.Contains(index, `href="beta.html"`) { + t.Errorf("index.html should link to both per-repo reports; got body excerpt: %.300s", index) + } + if !strings.Contains(index, "Scan Index") { + t.Errorf("index.html missing title block") + } +} + +// makeRepoWithCommit initializes a git repo with one file and one +// commit using a deterministic identity. +func makeRepoWithCommit(t *testing.T, dir, file, contents string) { + t.Helper() + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(dir, file), []byte(contents), 0o644); err != nil { + t.Fatal(err) + } + run := func(args ...string) { + cmd := exec.Command("git", args...) 
+ cmd.Dir = dir + cmd.Env = append(os.Environ(), + "GIT_AUTHOR_NAME=Test", + "GIT_AUTHOR_EMAIL=test@example.com", + "GIT_COMMITTER_NAME=Test", + "GIT_COMMITTER_EMAIL=test@example.com", + ) + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %s: %v\n%s", strings.Join(args, " "), err, out) + } + } + run("init", "-q", "-b", "main") + run("add", ".") + run("commit", "-q", "-m", "initial") +} + func TestValidateDate(t *testing.T) { cases := []struct { in string diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go new file mode 100644 index 0000000..82e39b7 --- /dev/null +++ b/internal/report/scan_index.go @@ -0,0 +1,157 @@ +package report + +import ( + "html/template" + "io" + "time" +) + +// ScanIndexEntry is one repo's row on the scan-index landing page. +// Successful repos populate the numeric fields; failed / pending +// repos leave them zero and surface Error instead. ReportHref is the +// relative URL the index uses to link into each per-repo report — +// empty when no report exists for that entry. +type ScanIndexEntry struct { + Slug string + Path string + Status string + Error string + Commits int + Devs int + Files int + Churn int64 + FirstCommitDate string + LastCommitDate string + ReportHref string +} + +// ScanIndexData is the top-level template input for the index page. +type ScanIndexData struct { + GeneratedAt string + Roots []string + Repos []ScanIndexEntry + // TotalRepos / OKRepos / FailedRepos are precomputed so the + // template doesn't need conditional arithmetic; the summary strip + // at the top renders directly from these. + TotalRepos int + OKRepos int + FailedRepos int + TotalCommits int + TotalDevs int + // Largest repo commit count — used to normalize the bar widths so + // the relative-volume bars are visually comparable across repos. 
+ MaxCommits int +} + +// GenerateScanIndex writes the scan landing page: a per-repo card +// list with links to each repo's standalone report and a short +// summary strip. Failures are surfaced inline rather than hidden so +// operators can spot them at a glance and dig into the manifest. +func GenerateScanIndex(w io.Writer, data ScanIndexData) error { + if data.GeneratedAt == "" { + data.GeneratedAt = time.Now().Format("2006-01-02 15:04") + } + return scanIndexTmpl.Execute(w, data) +} + +var scanIndexTmpl = template.Must(template.New("scan-index").Funcs(funcMap).Parse(scanIndexHTML)) + +const scanIndexHTML = ` + + + +gitcortex scan index ({{.TotalRepos}} repositories) + + + + +

Scan Index {{.TotalRepos}} repositor{{if eq .TotalRepos 1}}y{{else}}ies{{end}}

+

Generated {{.GeneratedAt}}{{if .Roots}} · roots: {{range $i, $r := .Roots}}{{if $i}}, {{end}}{{$r}}{{end}}{{end}}

+ +
+
Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}
+
Total commits
{{humanize .TotalCommits}}
+
Unique devs
{{thousands .TotalDevs}}
+
+ +

Each repo below links to its own standalone report. Metrics are per-repository — no cross-repo aggregation that would mix signals from unrelated codebases. For a consolidated developer view, use gitcortex scan --report <file> --email <address>.

+ +{{$max := .MaxCommits}} +{{range .Repos}} +
+
+
+ {{if .ReportHref}}{{.Slug}}{{else}}{{.Slug}}{{end}} + {{.Status}} +
+
{{.Path}}
+ {{if .Error}}
{{.Error}}
{{end}} +
+ {{if eq .Status "ok"}} +
+ {{humanize .Commits}} + commits +
+
+ {{humanize .Churn}} + churn +
+
+ {{.Devs}} + devs +
+
+ {{humanize .Files}} + files +
+
+ {{.FirstCommitDate}}
→ {{.LastCommitDate}} +
+ {{else}} +
No report available.
+ {{end}} +
+{{if eq .Status "ok"}} +
+
+ {{humanize .Commits}} / {{humanize $max}} max +
+{{end}} +{{end}} + + + + + +` From fefced53834a3727f366fcc4f3f2b857a27f0d31 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 18:17:58 -0300 Subject: [PATCH 22/37] Fix report-dir pending accounting, downgrade render errors, reserve index slug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three MAJORs from the review of commit a0a6251: 1. Pending/skipped entries were counted as failed in both the index summary card and the CLI log line. A user-cancelled scan read as "N failed" — misleading because nothing actually broke. Separate pendingCount bucket, new ScanIndexData.PendingRepos field, and the summary card shows "(M failed) (K pending)" as distinct chips. Log line drops zero buckets so a clean run still says "N ok" instead of "N ok, 0 failed, 0 pending". 2. os.Create and reportpkg.Generate errors aborted the entire batch, discarding every HTML already written AND skipping the index. Policy now matches LoadJSONL: the entry is downgraded to Status=failed with an explanatory Error, best-effort os.Remove cleans up any partial file, and the loop keeps going. Operators get an index listing exactly which repos rendered and which didn't. 3. A repo literally named `index` would emit /index.html and then be overwritten by the landing page. Add a reservedSlugs registry to scan.discovery; isReservedSlug triggers the same hash branch as a duplicate-basename collision. Case-insensitive per the existing fold (macOS/Windows can't tell `Index` from `index` on default filesystems). Tests cover all three: mixed ok+failed+pending manifest asserts the correct counts appear in the index, slug collisions force the hash, and all-failed still emits index.html without tripping the MaxCommits==0 guard in the bar-width template expression. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 47 ++++++++++++-- cmd/gitcortex/main_test.go | 111 ++++++++++++++++++++++++++++++++ internal/report/scan_index.go | 22 ++++--- internal/scan/discovery.go | 19 +++++- internal/scan/discovery_test.go | 26 ++++++++ 5 files changed, 209 insertions(+), 16 deletions(-) diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 90aaaf5..897f33c 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -914,14 +914,29 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt continue } + // All render-time errors for a single repo (create the output + // file, template.Execute) get demoted to an inline failure and + // the loop continues to the next repo — matching the policy + // already applied above for LoadJSONL. Aborting the batch on + // one bad render would discard every HTML already written and + // skip the index entirely, leaving operators staring at an + // incomplete directory with no way to discover the state. outFile := filepath.Join(dir, m.Slug+".html") f, err := os.Create(outFile) if err != nil { - return fmt.Errorf("create %s: %w", outFile, err) + entry.Status = "failed" + entry.Error = fmt.Sprintf("create %s: %v", outFile, err) + entries = append(entries, entry) + continue } if err := reportpkg.Generate(f, ds, m.Slug, topN, sf); err != nil { f.Close() - return fmt.Errorf("generate report for %s: %w", m.Slug, err) + // Best-effort cleanup: leave nothing half-written on disk. 
+ _ = os.Remove(outFile) + entry.Status = "failed" + entry.Error = fmt.Sprintf("render %s: %v", m.Slug, err) + entries = append(entries, entry) + continue } f.Close() @@ -952,12 +967,20 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt } defer f.Close() - var okCount, failedCount int + // Pending and failed live in separate buckets: pending means the + // worker never reached the repo (user cancelled mid-scan, or a + // future scheduling change), while failed means the extract was + // attempted and broke. Conflating them in the summary strip would + // mislead operators reading a cancel-shaped failure as a real + // repo-level problem. + var okCount, failedCount, pendingCount int for _, e := range entries { switch e.Status { case "ok": okCount++ - case "failed", "pending": + case "pending", "skipped": + pendingCount++ + default: failedCount++ } } @@ -968,6 +991,7 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt TotalRepos: len(entries), OKRepos: okCount, FailedRepos: failedCount, + PendingRepos: pendingCount, TotalCommits: totalCommits, TotalDevs: len(allDevs), MaxCommits: maxCommits, @@ -975,8 +999,19 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt if err := reportpkg.GenerateScanIndex(f, data); err != nil { return fmt.Errorf("generate index: %w", err) } - fmt.Fprintf(os.Stderr, "Per-repo reports written to %s (%d ok, %d failed); open %s\n", - dir, okCount, failedCount, fileURL(indexPath)) + + // Log line tracks the summary card: only non-zero buckets appear + // so a fully-successful scan doesn't confuse the operator with + // trailing "0 failed, 0 pending". 
+ parts := []string{fmt.Sprintf("%d ok", okCount)} + if failedCount > 0 { + parts = append(parts, fmt.Sprintf("%d failed", failedCount)) + } + if pendingCount > 0 { + parts = append(parts, fmt.Sprintf("%d pending", pendingCount)) + } + fmt.Fprintf(os.Stderr, "Per-repo reports written to %s (%s); open %s\n", + dir, strings.Join(parts, ", "), fileURL(indexPath)) return nil } diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index 884db8f..2b361d1 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -7,6 +7,9 @@ import ( "path/filepath" "strings" "testing" + + "github.com/lex0c/gitcortex/internal/scan" + "github.com/lex0c/gitcortex/internal/stats" ) // scanCmd must validate --since BEFORE running the discovery walk @@ -202,6 +205,114 @@ func TestScanCmd_ReportDirGeneratesPerRepoHTMLAndIndex(t *testing.T) { } } +// renderScanReportDir must render the index for a manifest that +// mixes ok / failed / pending statuses. Failed entries show their +// error message, pending entries render with the pending pill, and +// neither inflates the "failed" count in the summary card. 
+func TestRenderScanReportDir_MixedStatuses(t *testing.T) { + dir := t.TempDir() + okJSONL := filepath.Join(dir, "good.jsonl") + jsonlContent := `{"type":"commit","sha":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","author_email":"me@x.com","author_name":"Me","author_date":"2024-01-01T00:00:00Z","additions":10,"deletions":0,"files_changed":1} +{"type":"commit_file","commit":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","path_current":"a.go","additions":10,"deletions":0} +` + if err := os.WriteFile(okJSONL, []byte(jsonlContent), 0o644); err != nil { + t.Fatal(err) + } + + reportDir := t.TempDir() + result := &scan.Result{ + OutputDir: dir, + Manifest: scan.Manifest{ + Roots: []string{"/fake/root"}, + Repos: []scan.ManifestRepo{ + {Slug: "good", Path: "/fake/root/good", Status: "ok", JSONL: okJSONL}, + {Slug: "broken", Path: "/fake/root/broken", Status: "failed", Error: "simulated extract crash"}, + {Slug: "skipped", Path: "/fake/root/skipped", Status: "pending"}, + }, + }, + } + + err := renderScanReportDir(result, reportDir, + stats.LoadOptions{HalfLifeDays: 90, CoupMaxFiles: 50}, + stats.StatsFlags{CouplingMinChanges: 1, NetworkMinFiles: 1}, + 10) + if err != nil { + t.Fatalf("renderScanReportDir returned error despite inline-failure policy: %v", err) + } + + // OK entry's report file exists; failed/pending do not. + if _, err := os.Stat(filepath.Join(reportDir, "good.html")); err != nil { + t.Errorf("good.html missing: %v", err) + } + for _, unwanted := range []string{"broken.html", "skipped.html"} { + if _, err := os.Stat(filepath.Join(reportDir, unwanted)); err == nil { + t.Errorf("%s should NOT be written — status was not ok", unwanted) + } + } + + indexB, err := os.ReadFile(filepath.Join(reportDir, "index.html")) + if err != nil { + t.Fatalf("index.html: %v", err) + } + index := string(indexB) + + // Summary card splits failed and pending correctly: "(1 failed)" + // AND "(1 pending)" appear, not "(2 failed)". 
+ if strings.Contains(index, "(2 failed)") { + t.Error("summary miscounted pending as failed — `(2 failed)` appeared instead of separate buckets") + } + if !strings.Contains(index, "1 failed") { + t.Errorf("summary should show `1 failed`; index excerpt: %.600s", index) + } + if !strings.Contains(index, "1 pending") { + t.Errorf("summary should show `1 pending`; index excerpt: %.600s", index) + } + + // Failed entry's error bubbles up to the card. + if !strings.Contains(index, "simulated extract crash") { + t.Error("failed entry's error message missing from index") + } + + // Status pills render for each entry. Grep for the pill classes. + for _, want := range []string{"status-ok", "status-failed", "status-pending"} { + if !strings.Contains(index, want) { + t.Errorf("index missing %q pill — a status branch lost its style", want) + } + } + + // Only OK entries carry an href; failed/pending must not link to + // a file that doesn't exist. + if strings.Contains(index, `href="broken.html"`) || strings.Contains(index, `href="skipped.html"`) { + t.Error("index links to a non-existent per-repo HTML for a non-ok entry") + } +} + +// Render with zero successful repos must still emit an index +// without tripping on the MaxCommits==0 guard in the bar-width +// template expression. 
+func TestRenderScanReportDir_AllFailedStillEmitsIndex(t *testing.T) { + reportDir := t.TempDir() + result := &scan.Result{ + OutputDir: t.TempDir(), + Manifest: scan.Manifest{ + Repos: []scan.ManifestRepo{ + {Slug: "a", Status: "failed", Error: "boom"}, + {Slug: "b", Status: "failed", Error: "also boom"}, + }, + }, + } + err := renderScanReportDir(result, reportDir, + stats.LoadOptions{HalfLifeDays: 90, CoupMaxFiles: 50}, + stats.StatsFlags{CouplingMinChanges: 1, NetworkMinFiles: 1}, + 10) + if err != nil { + t.Fatalf("render should not fail on all-failed manifest: %v", err) + } + if _, err := os.Stat(filepath.Join(reportDir, "index.html")); err != nil { + t.Fatalf("index.html must be emitted even when every repo failed: %v", err) + } +} + // makeRepoWithCommit initializes a git repo with one file and one // commit using a deterministic identity. func makeRepoWithCommit(t *testing.T, dir, file, contents string) { diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 82e39b7..80f410a 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -30,14 +30,18 @@ type ScanIndexData struct { GeneratedAt string Roots []string Repos []ScanIndexEntry - // TotalRepos / OKRepos / FailedRepos are precomputed so the - // template doesn't need conditional arithmetic; the summary strip - // at the top renders directly from these. - TotalRepos int - OKRepos int - FailedRepos int - TotalCommits int - TotalDevs int + // TotalRepos / OKRepos / FailedRepos / PendingRepos are + // precomputed so the template doesn't need conditional arithmetic. + // Pending is distinct from failed: a pending repo is one the + // worker never reached (cancelled mid-scan), a failed repo is one + // whose extract or render broke. Mixing them in the summary would + // read a cancel-shaped partial run as a fleet-of-errors. 
+ TotalRepos int + OKRepos int + FailedRepos int + PendingRepos int + TotalCommits int + TotalDevs int // Largest repo commit count — used to normalize the bar widths so // the relative-volume bars are visually comparable across repos. MaxCommits int @@ -100,7 +104,7 @@ footer { margin-top: 30px; color: #656d76; font-size: 12px; text-align: center;

Generated {{.GeneratedAt}}{{if .Roots}} · roots: {{range $i, $r := .Roots}}{{if $i}}, {{end}}{{$r}}{{end}}{{end}}

-
Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}
+
Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}{{if gt .PendingRepos 0}} ({{.PendingRepos}} pending){{end}}
Total commits
{{humanize .TotalCommits}}
Unique devs
{{thousands .TotalDevs}}
diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 2989893..2d13774 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -187,6 +187,23 @@ func Discover(ctx context.Context, roots []string, matcher *Matcher, maxDepth in // grows the length if a truncation collision occurs. const initialSlugHashLen = 6 +// reservedSlugs are base names that cannot be emitted bare because +// downstream consumers reserve the name for their own output file. +// `scan --report-dir` writes /index.html as the landing page; +// a repo whose basename was literally `index` would collide and +// silently get overwritten by the landing or vice versa. Forcing +// the hash branch for reserved names avoids the collision without +// losing the repo. Compared case-insensitively to align with the +// case-folding done elsewhere in assignSlugs. +var reservedSlugs = map[string]struct{}{ + "index": {}, +} + +func isReservedSlug(base string) bool { + _, ok := reservedSlugs[strings.ToLower(base)] + return ok +} + // assignSlugs derives a unique slug per repo from its basename, falling // back to `-` when two repos share a name. // The slug is also the JSONL filename stem and the persistence key @@ -239,7 +256,7 @@ func assignSlugs(repos []Repo) { for i := range repos { base := bases[i] slug := base - if counts[strings.ToLower(base)] > 1 { + if counts[strings.ToLower(base)] > 1 || isReservedSlug(base) { h := sha1.Sum([]byte(repos[i].AbsPath)) slug = fmt.Sprintf("%s-%s", base, hex.EncodeToString(h[:])[:hashLen]) } diff --git a/internal/scan/discovery_test.go b/internal/scan/discovery_test.go index 5c1fa19..f8fd900 100644 --- a/internal/scan/discovery_test.go +++ b/internal/scan/discovery_test.go @@ -163,6 +163,32 @@ func findColliding6HexPaths(maxAttempts int) (string, string, bool) { return "", "", false } +// `index` is the landing-page filename used by `scan --report-dir`. 
+// A repo whose basename is literally `index` would emit +// `/index.html`, colliding with (and getting overwritten by) +// the landing page. Force the hash branch for reserved names so +// the per-repo report file never lands on a reserved path. +func TestAssignSlugs_ReservesIndex(t *testing.T) { + repos := []Repo{ + {AbsPath: "/workspace/index"}, + {AbsPath: "/other/unrelated"}, + } + assignSlugs(repos) + + for _, r := range repos { + if filepath.Base(r.AbsPath) == "index" && r.Slug == "index" { + t.Errorf("`index` basename must not produce the bare slug; landing page would be overwritten (path=%s, slug=%s)", r.AbsPath, r.Slug) + } + } + // Case-insensitivity: `Index` would also collide on a case- + // insensitive filesystem serving the HTML. + repos = []Repo{{AbsPath: "/workspace/Index"}} + assignSlugs(repos) + if strings.EqualFold(repos[0].Slug, "index") { + t.Errorf("`Index` basename must also be reserved; got %q", repos[0].Slug) + } +} + // Regression: on macOS (APFS/HFS+ default) and Windows (NTFS), // `Repo.jsonl` and `repo.jsonl` are the same file on disk. A // case-sensitive slug compare would hand both repos the bare From f2810b5cd679afccc6328b91b569d5d29125927f Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 18:24:05 -0300 Subject: [PATCH 23/37] Align scan-index footer with the per-repo report footer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both pages now render the identical 'Generated by gitcortex · ' footer — same link attributes, same color and text-decoration, same border-top spacing. The duplicate date in the subtitle (removed) kept the top strip uncluttered for the only stable identifier that varied between index and repo pages: the scanned roots. Visually the index now reads as the sibling of every per-repo report it links to instead of a one-off landing. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/scan_index.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 80f410a..ec0a063 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -94,14 +94,14 @@ h1 { font-size: 24px; margin-bottom: 4px; } .status-ok { background: #dafbe1; color: #1a7f37; } .status-failed { background: #ffebe9; color: #cf222e; } .status-pending { background: #fff4d4; color: #9a6700; } -footer { margin-top: 30px; color: #656d76; font-size: 12px; text-align: center; border-top: 1px solid #d0d7de; padding-top: 14px; } +footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; color: #656d76; font-size: 12px; } .hint { font-size: 12px; color: #656d76; font-style: italic; margin-bottom: 14px; }

Scan Index {{.TotalRepos}} repositor{{if eq .TotalRepos 1}}y{{else}}ies{{end}}

-

Generated {{.GeneratedAt}}{{if .Roots}} · roots: {{range $i, $r := .Roots}}{{if $i}}, {{end}}{{$r}}{{end}}{{end}}

+

{{if .Roots}}roots: {{range $i, $r := .Roots}}{{if $i}}, {{end}}{{$r}}{{end}}{{end}}

Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}{{if gt .PendingRepos 0}} ({{.PendingRepos}} pending){{end}}
@@ -154,7 +154,7 @@ footer { margin-top: 30px; color: #656d76; font-size: 12px; text-align: center; {{end}} {{end}} - + From c0bfccb4d4d1c8a7c74dd99562c4377ee5ff8527 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:01:13 -0300 Subject: [PATCH 24/37] Simplify scan-index heading and drop roots subtitle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shorten the H1 from "Scan Index" to "Index" and drop the "roots: ..." subtitle line. The roots were borderline-useful — the user picked them, so echoing them back added chrome without adding information. With the data summary cards carrying the interesting counts below, a terser heading reads cleaner. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main_test.go | 2 +- internal/report/scan_index.go | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index 2b361d1..7b5c9c3 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -200,7 +200,7 @@ func TestScanCmd_ReportDirGeneratesPerRepoHTMLAndIndex(t *testing.T) { if !strings.Contains(index, `href="alpha.html"`) || !strings.Contains(index, `href="beta.html"`) { t.Errorf("index.html should link to both per-repo reports; got body excerpt: %.300s", index) } - if !strings.Contains(index, "Scan Index") { + if !strings.Contains(index, "

Index") { t.Errorf("index.html missing title block") } } diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index ec0a063..2340a8c 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -100,8 +100,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col -

Scan Index {{.TotalRepos}} repositor{{if eq .TotalRepos 1}}y{{else}}ies{{end}}

-

{{if .Roots}}roots: {{range $i, $r := .Roots}}{{if $i}}, {{end}}{{$r}}{{end}}{{end}}

+

Index {{.TotalRepos}} repositor{{if eq .TotalRepos 1}}y{{else}}ies{{end}}

Repositories
{{thousands .OKRepos}}{{if gt .FailedRepos 0}} ({{.FailedRepos}} failed){{end}}{{if gt .PendingRepos 0}} ({{.PendingRepos}} pending){{end}}
From 3494e0e1ecb0cb5318d7cd284f715533af93d7a7 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:03:34 -0300 Subject: [PATCH 25/37] Trim redundant chrome from scan-index cards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two small subtractions driven by the same principle — don't repeat signal the surrounding layout already conveys: - Drop the "ok" status pill. The link itself plus the populated metric cells make status obvious for successful entries; showing a neutral pill on every card added noise. Failed and pending pills stay because those statuses swap the card body for an italic "No report available." and the pill is the only affordance marking which failure mode it was. - Drop the "N / M max" caption next to the relative-volume bar. The bar already visualizes the ratio, the "commits" metric cell already shows N in the same row, and the max is inferred by eyeballing the widest bar. Keeping the bar alone preserves comparison-at-a-glance without a third copy of the same number. Test updated to assert `status-ok` pill is absent while failed / pending pills remain. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main_test.go | 11 ++++++++--- internal/report/scan_index.go | 3 +-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index 7b5c9c3..ba25b4c 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -273,12 +273,17 @@ func TestRenderScanReportDir_MixedStatuses(t *testing.T) { t.Error("failed entry's error message missing from index") } - // Status pills render for each entry. Grep for the pill classes. - for _, want := range []string{"status-ok", "status-failed", "status-pending"} { + // Status pills render only for non-ok entries (ok is implicit + // given the link itself and the presence of metric cells). Assert + // the two non-ok pills exist and the ok one does not. 
+ for _, want := range []string{"status-failed", "status-pending"} { if !strings.Contains(index, want) { - t.Errorf("index missing %q pill — a status branch lost its style", want) + t.Errorf("index missing %q pill — status branch lost its style", want) } } + if strings.Contains(index, `class="status-pill status-ok"`) { + t.Error("ok entries should not render a status pill — redundant with the link and metric cells") + } // Only OK entries carry an href; failed/pending must not link to // a file that doesn't exist. diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 2340a8c..460ec43 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -116,7 +116,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col
{{if .ReportHref}}{{.Slug}}{{else}}{{.Slug}}{{end}} - {{.Status}} + {{if ne .Status "ok"}}{{.Status}}{{end}}
{{.Path}}
{{if .Error}}
{{.Error}}
{{end}} @@ -148,7 +148,6 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{if eq .Status "ok"}}
- {{humanize .Commits}} / {{humanize $max}} max
{{end}} {{end}} From 49cd00daee7a706211a29ef0a53b86324c0b2771 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:05:19 -0300 Subject: [PATCH 26/37] Drop green left-border accent from ok scan-index cards The green .repo.ok border competed with the layout's default border-radius and made a table of healthy repos look noisier than it needed to. Red (failed) and amber (pending) borders stay as attention cues for the genuinely actionable states; the absence of a stripe on ok cards now reads as "nothing to flag" by contrast. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/scan_index.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 460ec43..898b467 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -77,7 +77,6 @@ h1 { font-size: 24px; margin-bottom: 4px; } .repo { background: #fff; border: 1px solid #d0d7de; border-radius: 6px; padding: 16px 20px; margin-bottom: 10px; display: grid; grid-template-columns: 2fr 1fr 1fr 1fr 1fr 1.2fr; gap: 16px; align-items: center; } .repo.failed { border-left: 4px solid #cf222e; } .repo.pending { border-left: 4px solid #bf8700; } -.repo.ok { border-left: 4px solid #2da44e; } .repo .name { font-weight: 600; font-size: 15px; } .repo .name a { color: #0969da; text-decoration: none; } .repo .name a:hover { text-decoration: underline; } From 2ae796156495ac977b41ce525ab2ce65c392353a Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:13:37 -0300 Subject: [PATCH 27/37] Surface last-commit recency on scan index cards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dates cell used to show "2024-01-15 → 2026-04-18" raw, forcing operators scanning a list of 30 repos to do date math to pick out the abandoned ones. 
Add a recency chip above the date range: "today" / "Nd ago" / "Nmo ago" / "Ny ago", colored by freshness bucket — green for fresh (≤30d), gray for stable (≤1y), amber for stale (>1y). The raw dates stay as supporting detail for anyone who wants the exact range. HumanizeAgo is the caller entry point (uses time.Now); humanizeAgoAt takes an explicit reference time for deterministic tests. Table test pins the day/month/year transitions and the 365-day stable/stale boundary; render test asserts the chip class reaches the HTML and is absent from failed entries that have no dates. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 1 + internal/report/scan_index.go | 58 +++++++++++++++++- internal/report/scan_index_test.go | 98 ++++++++++++++++++++++++++++++ 3 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 internal/report/scan_index_test.go diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 897f33c..e1f3132 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -947,6 +947,7 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt entry.Churn = summary.TotalAdditions + summary.TotalDeletions entry.FirstCommitDate = summary.FirstCommitDate entry.LastCommitDate = summary.LastCommitDate + entry.LastCommitAgo, entry.RecencyBucket = reportpkg.HumanizeAgo(summary.LastCommitDate) entry.ReportHref = m.Slug + ".html" totalCommits += entry.Commits diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 898b467..64084f2 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -1,6 +1,7 @@ package report import ( + "fmt" "html/template" "io" "time" @@ -22,7 +23,57 @@ type ScanIndexEntry struct { Churn int64 FirstCommitDate string LastCommitDate string - ReportHref string + // LastCommitAgo is LastCommitDate humanized relative to the + // index generation time: "today" / "Nd ago" / "Nmo ago" / "Ny ago". 
+ // Lets operators spot abandoned repos in a list of 30 without + // doing date math in their head. + LastCommitAgo string + // RecencyBucket classifies LastCommitAgo into "fresh" (≤ 30 days), + // "stable" (≤ 1 year), "stale" (> 1 year). Drives the badge color + // in the template so the cold repos stand out at a glance. + RecencyBucket string + ReportHref string +} + +// HumanizeAgo is the caller entry point for the index: formats +// `lastDate` (YYYY-MM-DD) as a compact "ago" phrase relative to +// wall-clock now, and classifies recency into fresh / stable / +// stale for template coloring. Empty pair on unparsable input so +// failed/pending rows with no dates render nothing. +func HumanizeAgo(lastDate string) (label, bucket string) { + return humanizeAgoAt(lastDate, time.Now()) +} + +// humanizeAgoAt is the testable core — same logic as HumanizeAgo +// but takes an explicit "now" so tests can pin the reference time +// and avoid drift. +func humanizeAgoAt(lastDate string, now time.Time) (label, bucket string) { + t, err := time.Parse("2006-01-02", lastDate) + if err != nil { + return "", "" + } + days := int(now.Sub(t).Hours() / 24) + switch { + case days < 0: + return "", "" + case days == 0: + label = "today" + case days < 30: + label = fmt.Sprintf("%dd ago", days) + case days < 365: + label = fmt.Sprintf("%dmo ago", days/30) + default: + label = fmt.Sprintf("%dy ago", days/365) + } + switch { + case days <= 30: + bucket = "fresh" + case days <= 365: + bucket = "stable" + default: + bucket = "stale" + } + return label, bucket } // ScanIndexData is the top-level template input for the index page. 
@@ -89,6 +140,10 @@ h1 { font-size: 24px; margin-bottom: 4px; } .repo .bar-outer { flex: 1; height: 8px; background: #eaeef2; border-radius: 3px; overflow: hidden; } .repo .bar-inner { height: 100%; background: #2da44e; } .repo .dates { font-size: 11px; color: #656d76; font-family: "SF Mono", Consolas, monospace; } +.recency { display: inline-block; padding: 2px 8px; border-radius: 10px; font-size: 11px; font-weight: 600; font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; margin-bottom: 4px; } +.recency.fresh { background: #dafbe1; color: #1a7f37; } +.recency.stable { background: #eaeef2; color: #656d76; } +.recency.stale { background: #fff4d4; color: #9a6700; } .status-pill { display: inline-block; padding: 2px 8px; border-radius: 10px; font-size: 11px; font-weight: 600; } .status-ok { background: #dafbe1; color: #1a7f37; } .status-failed { background: #ffebe9; color: #cf222e; } @@ -138,6 +193,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col files
+ {{if .LastCommitAgo}}{{.LastCommitAgo}}
{{end}} {{.FirstCommitDate}}
→ {{.LastCommitDate}}
{{else}} diff --git a/internal/report/scan_index_test.go b/internal/report/scan_index_test.go new file mode 100644 index 0000000..6164640 --- /dev/null +++ b/internal/report/scan_index_test.go @@ -0,0 +1,98 @@ +package report + +import ( + "bytes" + "strings" + "testing" + "time" +) + +func TestHumanizeAgoAt(t *testing.T) { + now := time.Date(2026, 4, 20, 12, 0, 0, 0, time.UTC) + + cases := []struct { + name string + lastDate string + wantLabel string + wantBucket string + }{ + // Boundary on day 0 — "today" reads cleaner than "0d ago". + {"same day", "2026-04-20", "today", "fresh"}, + {"one day", "2026-04-19", "1d ago", "fresh"}, + {"29 days", "2026-03-22", "29d ago", "fresh"}, + // Transition from days to months at 30. + {"30 days exact", "2026-03-21", "1mo ago", "fresh"}, + {"two months", "2026-02-15", "2mo ago", "stable"}, + {"eleven months", "2025-05-25", "11mo ago", "stable"}, + // 365-day boundary: still "stable" at exactly one year, + // "stale" the day after. 2025-04-20 is 365 days before now. + {"one year exact (stable boundary)", "2025-04-20", "1y ago", "stable"}, + {"one year + one day (stale)", "2025-04-19", "1y ago", "stale"}, + {"two years stale", "2024-04-10", "2y ago", "stale"}, + // Parse failure yields empty. + {"bad input", "not-a-date", "", ""}, + // Future dates (clock skew) yield empty — we don't label "in 3d" + // on an index that exists to surface recency of PAST commits. + {"future date", "2026-05-01", "", ""}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + label, bucket := humanizeAgoAt(c.lastDate, now) + if label != c.wantLabel { + t.Errorf("label: got %q, want %q", label, c.wantLabel) + } + if bucket != c.wantBucket { + t.Errorf("bucket: got %q, want %q", bucket, c.wantBucket) + } + }) + } +} + +// End-to-end check that the recency chip renders for an ok entry +// and is absent for failed entries (which have no dates). Confirms +// the CSS bucket class is reachable by the template. 
+func TestGenerateScanIndex_RecencyChipRenders(t *testing.T) { + data := ScanIndexData{ + GeneratedAt: "2026-04-20 12:00", + TotalRepos: 2, + OKRepos: 1, + FailedRepos: 1, + MaxCommits: 10, + Repos: []ScanIndexEntry{ + { + Slug: "alive", + Path: "/work/alive", + Status: "ok", + Commits: 10, + FirstCommitDate: "2024-01-01", + LastCommitDate: "2026-04-18", + LastCommitAgo: "2d ago", + RecencyBucket: "fresh", + ReportHref: "alive.html", + }, + { + Slug: "broken", + Path: "/work/broken", + Status: "failed", + Error: "boom", + }, + }, + } + var buf bytes.Buffer + if err := GenerateScanIndex(&buf, data); err != nil { + t.Fatal(err) + } + out := buf.String() + if !strings.Contains(out, `class="recency fresh"`) { + t.Error("fresh recency chip missing from ok entry") + } + if !strings.Contains(out, `>2d ago<`) { + t.Error("recency label text missing") + } + // Failed entry has no dates block, so no recency chip should + // render for it — a weak but useful guard against a template + // restructure leaking the chip into the failure render. + if strings.Count(out, `class="recency`) != 1 { + t.Errorf("expected exactly one recency chip (ok entry only); got %d", strings.Count(out, `class="recency`)) + } +} From d59338ef6817927803efdde6bc36b0752cb6d4d5 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:37:32 -0300 Subject: [PATCH 28/37] Address 7 minors from the recency-batch review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seven small polish items surfaced by the post-recency review. All live in or around the new scan-index landing page: - Recency label at 360-364 days read as "12mo ago" under the stable color. Clamp months at 11 so the label progression never crosses into two-digit territory inside the still-stable band. - Ok entry whose dataset had no dates (empty JSONL, rename-only history) rendered an orphan arrow `
→ ` in the dates cell. Guard the date range behind
  `{{if .LastCommitDate}}` so the cell emits nothing instead of a
  lone glyph.

- Log line started every run with `0 ok` even when the count was
  zero, drifting from the summary card's zero-suppression
  convention. Build the log bucket list the same way — only
  non-zero counts, and open with the first non-zero bucket.

- `Data.Roots` field and `.subtitle` CSS rule were dead after the
  roots-subtitle removal commit. Drop both plus the now-unused
  Roots assignment at the call site.

- Convert `reservedSlugs` from a package-level mutable map to an
  `isReservedSlug` switch. Keeps the reserved set immutable — no
  risk of a future test silently leaking an entry across the
  process.

- Strengthen the recency render test: anchor the chip inside the
  `.dates` wrapper so a refactor that moves the chip outside its
  structural context fails the test instead of passing on the CSS
  class alone.

- Extend the humanizeAgoAt table with 360/364-day "stable" cases
  (pinning the new month clamp) and a sub-day-future case that
  documents the "today" fallback for intra-day committer-date
  skew.
Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main.go | 16 +++++++++++----- internal/report/scan_index.go | 25 +++++++++++++++++++++---- internal/report/scan_index_test.go | 24 +++++++++++++++++++++++- internal/scan/discovery.go | 28 +++++++++++++++------------- 4 files changed, 70 insertions(+), 23 deletions(-) diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index e1f3132..3937fcb 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -987,7 +987,6 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt } data := reportpkg.ScanIndexData{ - Roots: result.Manifest.Roots, Repos: entries, TotalRepos: len(entries), OKRepos: okCount, @@ -1001,16 +1000,23 @@ func renderScanReportDir(result *scan.Result, dir string, loadOpts stats.LoadOpt return fmt.Errorf("generate index: %w", err) } - // Log line tracks the summary card: only non-zero buckets appear - // so a fully-successful scan doesn't confuse the operator with - // trailing "0 failed, 0 pending". - parts := []string{fmt.Sprintf("%d ok", okCount)} + // Log line tracks the summary card's zero-suppression: only + // non-zero buckets appear, and the opener is whichever bucket + // has a count so an all-failed scan reads as "2 failed" instead + // of the misleading "0 ok, 2 failed". 
+ var parts []string + if okCount > 0 { + parts = append(parts, fmt.Sprintf("%d ok", okCount)) + } if failedCount > 0 { parts = append(parts, fmt.Sprintf("%d failed", failedCount)) } if pendingCount > 0 { parts = append(parts, fmt.Sprintf("%d pending", pendingCount)) } + if len(parts) == 0 { + parts = []string{"0 repos"} // defensive; renderScanReportDir requires > 0 manifest entries to be reached at all + } fmt.Fprintf(os.Stderr, "Per-repo reports written to %s (%s); open %s\n", dir, strings.Join(parts, ", "), fileURL(indexPath)) return nil diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 64084f2..4887641 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -47,6 +47,16 @@ func HumanizeAgo(lastDate string) (label, bucket string) { // humanizeAgoAt is the testable core — same logic as HumanizeAgo // but takes an explicit "now" so tests can pin the reference time // and avoid drift. +// +// Future-commit policy: a date strictly in the past ("days >= 0 +// when measured against midnight-to-midnight") is labeled. +// Anything earlier than today's date (even by one hour, via +// committer-date clock skew or CI rewrite) yields empty — the +// index exists to surface how stale each repo is, and "in 3d" on +// a repo whose scanner clock drifted would be actively misleading. +// Sub-day skew within TODAY lands in `days == 0 → "today"`, which +// is the most reasonable fallback given the template's day +// granularity. 
func humanizeAgoAt(lastDate string, now time.Time) (label, bucket string) { t, err := time.Parse("2006-01-02", lastDate) if err != nil { @@ -61,7 +71,16 @@ func humanizeAgoAt(lastDate string, now time.Time) (label, bucket string) { case days < 30: label = fmt.Sprintf("%dd ago", days) case days < 365: - label = fmt.Sprintf("%dmo ago", days/30) + // Cap the month reading at 11 — otherwise `days/30` emits + // "12mo ago" at days 360-364, which reads as older than + // "1y ago" even though it's actually in the still-stable + // (≤365d) band. Clamping keeps the label progression + // monotonic with the bucket color. + months := days / 30 + if months > 11 { + months = 11 + } + label = fmt.Sprintf("%dmo ago", months) default: label = fmt.Sprintf("%dy ago", days/365) } @@ -79,7 +98,6 @@ func humanizeAgoAt(lastDate string, now time.Time) (label, bucket string) { // ScanIndexData is the top-level template input for the index page. type ScanIndexData struct { GeneratedAt string - Roots []string Repos []ScanIndexEntry // TotalRepos / OKRepos / FailedRepos / PendingRepos are // precomputed so the template doesn't need conditional arithmetic. @@ -120,7 +138,6 @@ const scanIndexHTML = ` * { margin: 0; padding: 0; box-sizing: border-box; } body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; color: #24292f; background: #f6f8fa; padding: 20px; max-width: 1200px; margin: 0 auto; } h1 { font-size: 24px; margin-bottom: 4px; } -.subtitle { color: #656d76; font-size: 13px; margin-bottom: 24px; } .summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); gap: 12px; margin-bottom: 24px; } .summary-card { background: #fff; border: 1px solid #d0d7de; border-radius: 6px; padding: 16px; } .summary-card .label { font-size: 12px; color: #656d76; text-transform: uppercase; } @@ -194,7 +211,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col
{{if .LastCommitAgo}}{{.LastCommitAgo}}
{{end}} - {{.FirstCommitDate}}
→ {{.LastCommitDate}} + {{if .LastCommitDate}}{{.FirstCommitDate}}
→ {{.LastCommitDate}}{{end}}
{{else}}
No report available.
diff --git a/internal/report/scan_index_test.go b/internal/report/scan_index_test.go index 6164640..aca7ff1 100644 --- a/internal/report/scan_index_test.go +++ b/internal/report/scan_index_test.go @@ -24,6 +24,11 @@ func TestHumanizeAgoAt(t *testing.T) { {"30 days exact", "2026-03-21", "1mo ago", "fresh"}, {"two months", "2026-02-15", "2mo ago", "stable"}, {"eleven months", "2025-05-25", "11mo ago", "stable"}, + // Month label must not exceed "11mo" — days/30 = 12 on the + // 360-day boundary, but the label is clamped so the stable + // band always reads as sub-year. + {"month clamp at 360 days", "2025-04-25", "11mo ago", "stable"}, + {"month clamp at 364 days", "2025-04-21", "11mo ago", "stable"}, // 365-day boundary: still "stable" at exactly one year, // "stale" the day after. 2025-04-20 is 365 days before now. {"one year exact (stable boundary)", "2025-04-20", "1y ago", "stable"}, @@ -33,7 +38,12 @@ func TestHumanizeAgoAt(t *testing.T) { {"bad input", "not-a-date", "", ""}, // Future dates (clock skew) yield empty — we don't label "in 3d" // on an index that exists to surface recency of PAST commits. - {"future date", "2026-05-01", "", ""}, + {"full-day future", "2026-05-01", "", ""}, + // Sub-day future (committer-date hours ahead of scanner clock) + // lands in days==0 → "today". Documented: the index works at + // day granularity and "today" is the least-misleading fallback + // for intra-day skew. + {"same-day future (sub-day skew)", "2026-04-20", "today", "fresh"}, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { @@ -89,6 +99,18 @@ func TestGenerateScanIndex_RecencyChipRenders(t *testing.T) { if !strings.Contains(out, `>2d ago<`) { t.Error("recency label text missing") } + // Anchor the chip inside the `.dates` wrapper — without this, + // a future template change that moved the chip outside the + // dates cell (losing the date context) would still satisfy the + // class-only assertion above. 
+ if !strings.Contains(out, `class="dates"`) { + t.Error("`.dates` wrapper missing — the recency chip has lost its structural anchor") + } + datesIdx := strings.Index(out, `class="dates"`) + recencyIdx := strings.Index(out, `class="recency fresh"`) + if datesIdx < 0 || recencyIdx < 0 || recencyIdx < datesIdx { + t.Errorf("recency chip not inside `.dates` wrapper: dates@%d recency@%d", datesIdx, recencyIdx) + } // Failed entry has no dates block, so no recency chip should // render for it — a weak but useful guard against a template // restructure leaking the chip into the failure render. diff --git a/internal/scan/discovery.go b/internal/scan/discovery.go index 2d13774..533f522 100644 --- a/internal/scan/discovery.go +++ b/internal/scan/discovery.go @@ -187,21 +187,23 @@ func Discover(ctx context.Context, roots []string, matcher *Matcher, maxDepth in // grows the length if a truncation collision occurs. const initialSlugHashLen = 6 -// reservedSlugs are base names that cannot be emitted bare because -// downstream consumers reserve the name for their own output file. +// isReservedSlug reports whether base would collide with a name +// downstream consumers already use for their own output file. // `scan --report-dir` writes /index.html as the landing page; -// a repo whose basename was literally `index` would collide and -// silently get overwritten by the landing or vice versa. Forcing -// the hash branch for reserved names avoids the collision without -// losing the repo. Compared case-insensitively to align with the -// case-folding done elsewhere in assignSlugs. -var reservedSlugs = map[string]struct{}{ - "index": {}, -} - +// a repo whose basename was literally `index` would overwrite it +// (or be overwritten by it). Forcing the hash branch for reserved +// names avoids the collision without losing the repo. 
+// +// Switch, not a package-level map, to keep the reserved set +// immutable — a mutable map would let one test silently leak an +// entry into every other test that runs afterward. Case folded to +// align with the case-insensitivity elsewhere in assignSlugs. func isReservedSlug(base string) bool { - _, ok := reservedSlugs[strings.ToLower(base)] - return ok + switch strings.ToLower(base) { + case "index": + return true + } + return false } // assignSlugs derives a unique slug per repo from its basename, falling From 87d3e8b5818fe243f382bff43e3fe84a52e954c6 Mon Sep 17 00:00:00 2001 From: lex0c Date: Mon, 20 Apr 2026 19:39:27 -0300 Subject: [PATCH 29/37] Drop scan-index H1; promote summary cards to page anchor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "Index · N repositories" heading restated information the summary strip's "Repositories" card already surfaces one line below. Removing it lets the three-card summary (Repositories, Total commits, Unique devs) open the page — the counts users actually came to see. Dead `h1` CSS rule dropped at the same time. Test now anchors on the "Repositories" label in the summary strip, which survives layout tweaks better than a specific H1 string. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/gitcortex/main_test.go | 7 +++++-- internal/report/scan_index.go | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmd/gitcortex/main_test.go b/cmd/gitcortex/main_test.go index ba25b4c..34c55ee 100644 --- a/cmd/gitcortex/main_test.go +++ b/cmd/gitcortex/main_test.go @@ -200,8 +200,11 @@ func TestScanCmd_ReportDirGeneratesPerRepoHTMLAndIndex(t *testing.T) { if !strings.Contains(index, `href="alpha.html"`) || !strings.Contains(index, `href="beta.html"`) { t.Errorf("index.html should link to both per-repo reports; got body excerpt: %.300s", index) } - if !strings.Contains(index, "

Index") { - t.Errorf("index.html missing title block") + // The summary card strip is now the top-of-page anchor (no H1). + // Asserting on the "Repositories" label keeps the test meaningful + // if the card layout changes. + if !strings.Contains(index, "Repositories") { + t.Errorf("index.html missing summary strip") } } diff --git a/internal/report/scan_index.go b/internal/report/scan_index.go index 4887641..2393546 100644 --- a/internal/report/scan_index.go +++ b/internal/report/scan_index.go @@ -137,7 +137,6 @@ const scanIndexHTML = `