From 5117a3081d8e3e12ba4771b2f89b43b02f5065bf Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 2 Apr 2026 21:51:30 +0200 Subject: [PATCH] gitindex: fall back when cat-file filter is unsupported Older Git versions can reject cat-file --batch with --filter, causing the process to exit before yielding any blobs. We were treating the closed stdout pipe as EOF, which made indexing silently produce empty shards. Capture cat-file stderr and wait status so startup failures surface as real errors. If cat-file fails before any blobs are processed, fall back to the existing go-git path so indexing still succeeds on older Git installations. Keep mid-stream cat-file failures fatal instead of falling back once some blobs have already been indexed. That avoids mixing two readers and writing partial or inconsistent shards. --- gitindex/catfile.go | 107 +++++++++++++++++++++++++++++----- gitindex/index.go | 63 ++++++++++++++------ gitindex/index_test.go | 127 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 268 insertions(+), 29 deletions(-) diff --git a/gitindex/catfile.go b/gitindex/catfile.go index 1933c337c..d422355c2 100644 --- a/gitindex/catfile.go +++ b/gitindex/catfile.go @@ -18,10 +18,12 @@ import ( "bufio" "bytes" "encoding/hex" + "errors" "fmt" "io" "os/exec" "strconv" + "strings" "sync" "syscall" @@ -60,10 +62,16 @@ type catfileReader struct { cmd *exec.Cmd reader *bufio.Reader writeErr <-chan error + stderr <-chan catfileStderrResult + + procOnce sync.Once + procMu sync.Mutex + procErr error // pending tracks unread content bytes + trailing LF for the current // entry. Next() discards any pending bytes before reading the next header. pending int + eof bool closeOnce sync.Once closeErr error @@ -92,14 +100,36 @@ func newCatfileReader(repoDir string, ids []plumbing.Hash, opts catfileReaderOpt return nil, fmt.Errorf("stdout pipe: %w", err) } + stderr, err := cmd.StderrPipe() + if err != nil { + stdin.Close() + stdout.Close() + return nil, fmt.Errorf("stderr pipe: %w", err) + } + if err := cmd.Start(); err != nil { stdin.Close() stdout.Close() + stderr.Close() return nil, fmt.Errorf("start git cat-file: %w", err) } - // Writer goroutine: feed all SHAs then close stdin to trigger flush. writeErr := make(chan error, 1) + stderrDone := make(chan catfileStderrResult, 1) + + cr := &catfileReader{ + cmd: cmd, + reader: bufio.NewReaderSize(stdout, 512*1024), + writeErr: writeErr, + stderr: stderrDone, + } + go func() { + stderrBytes, stderrReadErr := io.ReadAll(stderr) + stderrDone <- catfileStderrResult{data: stderrBytes, err: stderrReadErr} + close(stderrDone) + }() + + // Writer goroutine: feed all SHAs then close stdin to trigger flush. go func() { defer close(writeErr) defer stdin.Close() @@ -116,11 +146,7 @@ func newCatfileReader(repoDir string, ids []plumbing.Hash, opts catfileReaderOpt writeErr <- bw.Flush() }() - return &catfileReader{ - cmd: cmd, - reader: bufio.NewReaderSize(stdout, 512*1024), - writeErr: writeErr, - }, nil + return cr, nil } // Next advances to the next blob entry. It returns the blob's size and whether @@ -131,10 +157,14 @@ func newCatfileReader(repoDir string, ids []plumbing.Hash, opts catfileReaderOpt // After Next returns successfully with missing=false and excluded=false, call // Read to consume the blob content, or call Next again to skip it. func (cr *catfileReader) Next() (size int, missing bool, excluded bool, err error) { + if cr.eof { + return 0, false, false, io.EOF + } + // Discard unread content from the previous entry. if cr.pending > 0 { if _, err := cr.reader.Discard(cr.pending); err != nil { - return 0, false, false, fmt.Errorf("discard pending bytes: %w", err) + return 0, false, false, cr.wrapReadError("discard pending bytes", err) } cr.pending = 0 } @@ -142,12 +172,15 @@ func (cr *catfileReader) Next() (size int, missing bool, excluded bool, err erro headerBytes, err := cr.reader.ReadBytes('\n') if err != nil { if err == io.EOF { + if procErr := cr.waitProcess(); procErr != nil { + return 0, false, false, procErr + } + cr.eof = true return 0, false, false, io.EOF } return 0, false, false, fmt.Errorf("read header: %w", err) } header := headerBytes[:len(headerBytes)-1] // trim \n - if bytes.HasSuffix(header, []byte(" missing")) { return 0, true, false, nil } @@ -184,7 +217,7 @@ func (cr *catfileReader) Read(p []byte) (int, error) { if contentRemaining <= 0 { // Only the trailing LF remains; consume it and signal EOF. if _, err := cr.reader.ReadByte(); err != nil { - return 0, fmt.Errorf("read trailing LF: %w", err) + return 0, cr.wrapReadError("read trailing LF", err) } cr.pending = 0 return 0, io.EOF @@ -197,13 +230,13 @@ func (cr *catfileReader) Read(p []byte) (int, error) { n, err := cr.reader.Read(p) cr.pending -= n if err != nil { - return n, err + return n, cr.wrapReadError("read blob content", err) } // If we've consumed all content bytes, also consume the trailing LF. if cr.pending == 1 { if _, err := cr.reader.ReadByte(); err != nil { - return n, fmt.Errorf("read trailing LF: %w", err) + return n, cr.wrapReadError("read trailing LF", err) } cr.pending = 0 } @@ -223,7 +256,7 @@ func (cr *catfileReader) Close() error { _, _ = io.Copy(io.Discard, cr.reader) // Wait for writer goroutine (unblocks via broken pipe from Kill). <-cr.writeErr - err := cr.cmd.Wait() + err := cr.waitProcess() // Suppress the expected "signal: killed" error from our own Kill(). if isKilledErr(err) { err = nil @@ -233,9 +266,57 @@ func (cr *catfileReader) Close() error { return cr.closeErr } +func (cr *catfileReader) waitProcess() error { + cr.procOnce.Do(func() { + stderr := <-cr.stderr + waitErr := cr.cmd.Wait() + + cr.procMu.Lock() + cr.procErr = formatCatfileProcessErr(waitErr, stderr.data, stderr.err) + cr.procMu.Unlock() + }) + + cr.procMu.Lock() + defer cr.procMu.Unlock() + + return cr.procErr +} + +func (cr *catfileReader) wrapReadError(context string, err error) error { + if err == io.EOF { + if procErr := cr.waitProcess(); procErr != nil { + return procErr + } + } + + return fmt.Errorf("%s: %w", context, err) +} + +func formatCatfileProcessErr(waitErr error, stderr []byte, stderrReadErr error) error { + if waitErr == nil { + if stderrReadErr != nil { + return fmt.Errorf("read git cat-file stderr: %w", stderrReadErr) + } + return nil + } + + stderrText := strings.TrimSpace(string(stderr)) + if stderrText == "" { + return fmt.Errorf("git cat-file exited unsuccessfully: %w", waitErr) + } + + return fmt.Errorf("git cat-file exited unsuccessfully: %w: %s", waitErr, stderrText) +} + +type catfileStderrResult struct { + data []byte + err error +} + // isKilledErr reports whether err is an exec.ExitError caused by SIGKILL. func isKilledErr(err error) bool { - exitErr, ok := err.(*exec.ExitError) + var exitErr *exec.ExitError + ok := errors.As(err, &exitErr) if !ok { return false } diff --git a/gitindex/index.go b/gitindex/index.go index 5b9d5abf1..91b0b85a1 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -655,24 +655,21 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { } if err := indexCatfileBlobs(cr, mainRepoKeys, repos, opts, builder); err != nil { - return false, err + var readErr *catfileReadError + if errors.As(err, &readErr) && readErr.processed == 0 { + log.Printf("git cat-file failed before yielding any blobs, falling back to go-git for %d blobs: %v", len(mainRepoKeys), err) + if err := indexGoGitBlobs(mainRepoKeys, repos, opts.BuildOptions, builder); err != nil { + return false, err + } + } else { + return false, err + } } } // Index submodule blobs via go-git. - for idx, key := range submoduleKeys { - doc, err := createDocument(key, repos, opts.BuildOptions) - if err != nil { - return false, err - } - - if err := builder.Add(doc); err != nil { - return false, fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) - } - - if idx%10_000 == 0 { - builder.CheckMemoryUsage() - } + if err := indexGoGitBlobs(submoduleKeys, repos, opts.BuildOptions, builder); err != nil { + return false, err } return true, builder.Finish() @@ -684,11 +681,12 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { // The reader is always closed when this function returns. func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]BlobLocation, opts Options, builder *index.Builder) error { defer cr.Close() + processed := 0 for idx, key := range keys { size, missing, excluded, err := cr.Next() if err != nil { - return fmt.Errorf("cat-file next for %s: %w", key.FullPath(), err) + return &catfileReadError{processed: processed, err: fmt.Errorf("cat-file next for %s: %w", key.FullPath(), err)} } branches := repos[key].Branches @@ -712,7 +710,7 @@ func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]Blob // avoids the intermediate allocation and the size is known. content := make([]byte, size) if _, err := io.ReadFull(cr, content); err != nil { - return fmt.Errorf("read blob %s: %w", keyFullPath, err) + return &catfileReadError{processed: processed, err: fmt.Errorf("read blob %s: %w", keyFullPath, err)} } doc = index.Document{ SubRepositoryPath: key.SubRepoPath, @@ -723,6 +721,39 @@ func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]Blob } } + if err := builder.Add(doc); err != nil { + return fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) + } + processed++ + + if idx%10_000 == 0 { + builder.CheckMemoryUsage() + } + } + + return nil +} + +type catfileReadError struct { + processed int + err error +} + +func (e *catfileReadError) Error() string { + return e.err.Error() +} + +func (e *catfileReadError) Unwrap() error { + return e.err +} + +func indexGoGitBlobs(keys []fileKey, repos map[fileKey]BlobLocation, opts index.Options, builder *index.Builder) error { + for idx, key := range keys { + doc, err := createDocument(key, repos, opts) + if err != nil { + return err + } + if err := builder.Add(doc); err != nil { return fmt.Errorf("error adding document with name %s: %w", key.FullPath(), err) } diff --git a/gitindex/index_test.go b/gitindex/index_test.go index aa09369a5..e6f566644 100644 --- a/gitindex/index_test.go +++ b/gitindex/index_test.go @@ -18,6 +18,7 @@ import ( "bytes" "context" "errors" + "fmt" "net/url" "os" "os/exec" @@ -137,6 +138,132 @@ func TestIndexTinyRepo(t *testing.T) { } } +func TestIndexGitRepo_FallbackWhenCatfileFilterUnsupported(t *testing.T) { + dir := t.TempDir() + runGit(t, dir, "init", "-b", "main", "repo") + + repoDir := filepath.Join(dir, "repo") + if err := os.WriteFile(filepath.Join(repoDir, "file1.go"), []byte("package main\n\nfunc main() {}\n"), 0o644); err != nil { + t.Fatalf("WriteFile: %v", err) + } + runGit(t, repoDir, "add", ".") + runGit(t, repoDir, "commit", "-m", "initial commit") + + realGit, err := exec.LookPath("git") + if err != nil { + t.Fatalf("LookPath(git): %v", err) + } + + shimDir := t.TempDir() + shimPath := filepath.Join(shimDir, "git") + shim := fmt.Sprintf(`#!/bin/sh +if [ "$1" = "cat-file" ]; then + shift + for arg in "$@"; do + case "$arg" in + --filter=*) + echo "error: unknown option '$arg'" >&2 + echo "usage: git cat-file" >&2 + exit 129 + ;; + esac + done + exec %q cat-file "$@" +fi +exec %q "$@" +`, realGit, realGit) + if err := os.WriteFile(shimPath, []byte(shim), 0o755); err != nil { + t.Fatalf("WriteFile(%q): %v", shimPath, err) + } + t.Setenv("PATH", shimDir+string(os.PathListSeparator)+os.Getenv("PATH")) + + indexDir := t.TempDir() + opts := Options{ + RepoDir: repoDir, + Branches: []string{"main"}, + BuildOptions: index.Options{ + RepositoryDescription: zoekt.Repository{Name: "repo"}, + IndexDir: indexDir, + SizeMax: 1 << 20, + }, + } + + if _, err := IndexGitRepo(opts); err != nil { + t.Fatalf("IndexGitRepo: %v", err) + } + + searcher, err := search.NewDirectorySearcher(indexDir) + if err != nil { + t.Fatal("NewDirectorySearcher", err) + } + defer searcher.Close() + + results, err := searcher.Search(context.Background(), &query.Const{Value: true}, &zoekt.SearchOptions{}) + if err != nil { + t.Fatal("search failed", err) + } + + if len(results.Files) != 1 { + t.Fatalf("got search result %v, want 1 file", results.Files) + } +} + +func TestIndexGitRepo_ErrorsIfCatfileFailsAfterFirstBlob(t *testing.T) { + dir := t.TempDir() + runGit(t, dir, "init", "-b", "main", "repo") + + repoDir := filepath.Join(dir, "repo") + for name, content := range map[string]string{ + "a.go": "package main\n\nconst A = 1\n", + "b.go": "package main\n\nconst B = 2\n", + } { + if err := os.WriteFile(filepath.Join(repoDir, name), []byte(content), 0o644); err != nil { + t.Fatalf("WriteFile(%q): %v", name, err) + } + } + runGit(t, repoDir, "add", ".") + runGit(t, repoDir, "commit", "-m", "initial commit") + + realGit, err := exec.LookPath("git") + if err != nil { + t.Fatalf("LookPath(git): %v", err) + } + + shimDir := t.TempDir() + shimPath := filepath.Join(shimDir, "git") + shim := fmt.Sprintf(`#!/bin/sh +if [ "$1" = "cat-file" ]; then + shift + IFS= read -r first || exit 1 + printf '%%s\n' "$first" | %q cat-file "$@" + echo "error: synthetic cat-file failure after first object" >&2 + exit 1 +fi +exec %q "$@" +`, realGit, realGit) + if err := os.WriteFile(shimPath, []byte(shim), 0o755); err != nil { + t.Fatalf("WriteFile(%q): %v", shimPath, err) + } + t.Setenv("PATH", shimDir+string(os.PathListSeparator)+os.Getenv("PATH")) + + indexDir := t.TempDir() + opts := Options{ + RepoDir: repoDir, + Branches: []string{"main"}, + BuildOptions: index.Options{ + RepositoryDescription: zoekt.Repository{Name: "repo"}, + IndexDir: indexDir, + SizeMax: 1 << 20, + }, + } + + if _, err := IndexGitRepo(opts); err == nil { + t.Fatal("IndexGitRepo succeeded, want error") + } else if !strings.Contains(err.Error(), "synthetic cat-file failure after first object") { + t.Fatalf("IndexGitRepo error = %v, want synthetic cat-file failure", err) + } +} + func TestIndexGitRepo_Worktree(t *testing.T) { t.Parallel()