Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 29 additions & 4 deletions index/contentprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package index

import (
"bytes"
"fmt"
"log"
"path"
"slices"
Expand Down Expand Up @@ -99,14 +100,21 @@ func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
return r
}

sample := p.id.runeOffsets
runeEnds := p.id.fileEndRunes
fileStartByte := p.id.boundaries[p.idx]
var sample runeOffsetMap
var runeEnds []uint32
var fileStartByte, fileEndByte uint32
if filename {
sample = p.id.fileNameRuneOffsets
runeEnds = p.id.fileNameEndRunes
fileStartByte = p.id.fileNameIndex[p.idx]
fileEndByte = p.id.fileNameIndex[p.idx+1]
} else {
sample = p.id.runeOffsets
runeEnds = p.id.fileEndRunes
fileStartByte = p.id.boundaries[p.idx]
fileEndByte = p.id.boundaries[p.idx+1]
}
fileSize := fileEndByte - fileStartByte

absR := r
if p.idx > 0 {
Expand All @@ -118,20 +126,37 @@ func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
var data []byte

if filename {
if byteOff > uint32(len(p.id.fileNameContent)) {
p.err = fmt.Errorf("corrupt index: filename rune offset %d maps to byte offset %d past filename data size %d", absR, byteOff, len(p.id.fileNameContent))
return fileSize
}
data = p.id.fileNameContent[byteOff:]
} else {
data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
if p.err != nil {
return 0
return fileSize
}
}
for left > 0 {
if len(data) == 0 {
p.err = fmt.Errorf("corrupt index: rune offset %d maps past available data", absR)
return fileSize
}
_, sz := utf8.DecodeRune(data)
byteOff += uint32(sz)
data = data[sz:]
left--
}

if byteOff < fileStartByte {
p.err = fmt.Errorf("corrupt index: rune offset %d maps to byte offset %d before file start %d", absR, byteOff, fileStartByte)
return fileSize
}
if byteOff > fileEndByte {
p.err = fmt.Errorf("corrupt index: rune offset %d maps to byte offset %d after file end %d", absR, byteOff, fileEndByte)
return fileSize
}

byteOff -= fileStartByte
return byteOff
}
Expand Down
7 changes: 7 additions & 0 deletions index/matchiter.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,14 @@ type candidateMatch struct {

// Matches content against the substring, and populates byteMatchSz on success
func (m *candidateMatch) matchContent(content []byte) bool {
if int(m.byteOffset) > len(content) {
return false
}

if m.caseSensitive {
if int(m.byteOffset)+len(m.substrBytes) > len(content) {
return false
}
comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))])

m.byteMatchSz = uint32(len(m.substrBytes))
Expand Down
47 changes: 47 additions & 0 deletions index/matchiter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ package index

import (
"reflect"
"strings"
"testing"
)

Expand All @@ -38,3 +39,49 @@ tool fieldalignment then update this test.`, c.v, c.size, got)
}
}
}

// TestCandidateMatchContentOutOfBounds feeds matchContent candidate matches
// whose byte offset or substring length would read past the end of the
// content slice, and checks that each is rejected rather than matched.
func TestCandidateMatchContentOutOfBounds(t *testing.T) {
	content := []byte("abc")

	cases := []struct {
		name  string
		match candidateMatch
	}{
		{
			name: "offset past content",
			match: candidateMatch{
				byteOffset:    4,
				substrLowered: []byte("x"),
			},
		},
		{
			name: "case-sensitive match extends past content",
			match: candidateMatch{
				byteOffset:    2,
				substrBytes:   []byte("cd"),
				caseSensitive: true,
			},
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			if c.match.matchContent(content) {
				t.Fatal("matchContent returned true for an out-of-bounds match")
			}
		})
	}
}

func TestFindOffsetRejectsByteOffsetBeforeFileStart(t *testing.T) {
cp := contentProvider{
id: &indexData{
fileNameContent: []byte("previous/current"),
fileNameIndex: []uint32{9, 16},
fileNameEndRunes: []uint32{7},
},
}

if got, want := cp.findOffset(true, 0), uint32(7); got != want {
t.Fatalf("findOffset returned %d, want file size %d", got, want)
}
if cp.err == nil || !strings.Contains(cp.err.Error(), "before file start") {
t.Fatalf("findOffset error = %v", cp.err)
}
}
73 changes: 73 additions & 0 deletions index/read.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"os"
"slices"
"sort"
"unicode/utf8"

"github.com/RoaringBitmap/roaring"
"github.com/prometheus/client_golang/prometheus"
Expand Down Expand Up @@ -513,11 +514,83 @@ func (d *indexData) verify() error {
"branch masks": len(d.fileBranchMasks),
"doc section index": len(d.docSectionsIndex) - 1,
"newlines index": len(d.newlinesIndex) - 1,
"file end runes": len(d.fileEndRunes),
"name end runes": len(d.fileNameEndRunes),
} {
if got != n {
return fmt.Errorf("got %s %d, want %d", what, got, n)
}
}

if err := d.verifyRuneBoundaryMapping("content", d.boundaries, d.fileEndRunes, d.runeOffsets, false); err != nil {
return err
}
if err := d.verifyRuneBoundaryMapping("filename", d.fileNameIndex, d.fileNameEndRunes, d.fileNameRuneOffsets, true); err != nil {
return err
}
return nil
}

// verifyRuneBoundaryMapping cross-checks the rune-to-byte offset mapping for
// one corpus (file content or filenames) against the per-document byte and
// rune boundaries, returning a "corrupt index" error on any inconsistency.
//
// what labels error messages (e.g. "content" or "filename"); byteEnds holds
// len(runeEnds)+1 byte boundaries, with document i spanning
// byteEnds[i]..byteEnds[i+1]; runeEnds[i] is the cumulative rune count at the
// end of document i; sample maps a rune offset to a sampled byte offset plus
// the number of runes left to decode from there; filename selects the
// in-memory fileNameContent over reading from the backing file section.
func (d *indexData) verifyRuneBoundaryMapping(what string, byteEnds, runeEnds []uint32, sample runeOffsetMap, filename bool) error {
	var prevRune uint32
	for i, endRune := range runeEnds {
		startByte := byteEnds[i]
		endByte := byteEnds[i+1]
		// Byte boundaries of each document must be non-decreasing.
		if endByte < startByte {
			return fmt.Errorf("corrupt index: %s %d byte end %d before start %d", what, i, endByte, startByte)
		}
		// Cumulative rune counts must be non-decreasing across documents.
		if endRune < prevRune {
			return fmt.Errorf("corrupt index: %s %d rune end %d before previous end %d", what, i, endRune, prevRune)
		}

		if d.metaData.PlainASCII {
			// Pure ASCII: one byte per rune, so the rune boundary is fully
			// determined by the byte boundaries and no decoding is needed.
			wantRuneEnd := prevRune + endByte - startByte
			if endRune != wantRuneEnd {
				return fmt.Errorf("corrupt index: plain ASCII %s %d ends at rune %d, want %d from byte boundaries", what, i, endRune, wantRuneEnd)
			}
			prevRune = endRune
			continue
		}

		// Sampled starting point: byteOff is at or before the document's
		// byte end; left runes remain to be decoded to reach endRune.
		byteOff, left := sample.lookup(endRune)
		if byteOff > endByte {
			return fmt.Errorf("corrupt index: %s %d rune end %d maps to byte offset %d after byte end %d", what, i, endRune, byteOff, endByte)
		}

		var data []byte
		if filename {
			// Filenames live in memory; bounds-check before slicing to
			// avoid panicking on a corrupt index.
			if byteOff > uint32(len(d.fileNameContent)) {
				return fmt.Errorf("corrupt index: %s %d rune end %d maps to byte offset %d past filename data size %d", what, i, endRune, byteOff, len(d.fileNameContent))
			}
			if endByte > uint32(len(d.fileNameContent)) {
				return fmt.Errorf("corrupt index: %s %d byte end %d past filename data size %d", what, i, endByte, len(d.fileNameContent))
			}
			data = d.fileNameContent[byteOff:endByte]
		} else {
			// Content is read on demand from the backing section.
			// NOTE(review): the read is based at boundariesStart — confirm
			// this is the correct base offset for the content section.
			blob, err := d.readSectionBlob(simpleSection{
				off: d.boundariesStart + byteOff,
				sz:  endByte - byteOff,
			})
			if err != nil {
				return err
			}
			data = blob
		}

		// Decode the remaining runes; byteOff must land exactly on endByte.
		for ; left > 0; left-- {
			if len(data) == 0 {
				return fmt.Errorf("corrupt index: %s %d rune end %d does not have enough bytes before byte end %d", what, i, endRune, endByte)
			}
			_, sz := utf8.DecodeRune(data)
			byteOff += uint32(sz)
			data = data[sz:]
		}

		if byteOff != endByte {
			return fmt.Errorf("corrupt index: %s %d rune end %d maps to byte offset %d, want byte end %d", what, i, endRune, byteOff, endByte)
		}
		prevRune = endRune
	}
	return nil
}

Expand Down
63 changes: 63 additions & 0 deletions index/read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,69 @@ func TestReadWrite(t *testing.T) {
}
}

// TestVerifyRuneBoundaryMapping builds a tiny two-document index containing
// multi-byte runes, confirms verify accepts it, then corrupts the content
// and filename rune boundaries in turn and checks verify flags each one.
func TestVerifyRuneBoundaryMapping(t *testing.T) {
	id := &indexData{
		file:             &memSeeker{[]byte("éx")},
		boundaries:       []uint32{0, 2, 3},
		fileEndRunes:     []uint32{1, 2},
		fileNameContent:  []byte("abé"),
		fileNameIndex:    []uint32{0, 1, 4},
		fileNameEndRunes: []uint32{1, 3},
		fileBranchMasks:  []uint64{0, 0},
		docSectionsIndex: []uint32{0, 0, 0},
		newlinesIndex:    []uint32{0, 0, 0},
	}

	// The uncorrupted index must pass verification.
	if err := id.verify(); err != nil {
		t.Fatalf("verify: %v", err)
	}

	// Break the content rune boundary, check the error, then restore it.
	id.fileEndRunes[0] = 0
	contentErr := id.verify()
	if contentErr == nil || !strings.Contains(contentErr.Error(), "content 0 rune end 0 maps to byte offset 0, want byte end 2") {
		t.Fatalf("verify content corruption error = %v", contentErr)
	}
	id.fileEndRunes[0] = 1

	// Break the filename rune boundary and check the error.
	id.fileNameEndRunes[1] = 2
	nameErr := id.verify()
	if nameErr == nil || !strings.Contains(nameErr.Error(), "filename 1 rune end 2 maps to byte offset 2, want byte end 4") {
		t.Fatalf("verify filename corruption error = %v", nameErr)
	}
}

// TestVerifyPlainASCIIRuneBoundaryMapping exercises the PlainASCII fast path
// of rune-boundary verification: boundaries that agree one-byte-per-rune
// pass, while a multi-byte rune hiding behind the PlainASCII flag is caught.
func TestVerifyPlainASCIIRuneBoundaryMapping(t *testing.T) {
	valid := &indexData{
		file:             &memSeeker{[]byte("abc")},
		boundaries:       []uint32{0, 1, 3},
		fileEndRunes:     []uint32{1, 3},
		fileNameContent:  []byte("fg"),
		fileNameIndex:    []uint32{0, 1, 2},
		fileNameEndRunes: []uint32{1, 2},
		fileBranchMasks:  []uint64{0, 0},
		docSectionsIndex: []uint32{0, 0, 0},
		newlinesIndex:    []uint32{0, 0, 0},
		metaData: zoekt.IndexMetadata{
			PlainASCII: true,
		},
	}
	if err := valid.verify(); err != nil {
		t.Fatalf("verify: %v", err)
	}

	// A single document whose content is one 2-byte rune: with PlainASCII
	// set, the byte span (2) contradicts the claimed rune end (1).
	corrupt := &indexData{
		file:             &memSeeker{[]byte("é")},
		boundaries:       []uint32{0, 2},
		fileEndRunes:     []uint32{1},
		fileNameContent:  []byte("f"),
		fileNameIndex:    []uint32{0, 1},
		fileNameEndRunes: []uint32{1},
		fileBranchMasks:  []uint64{0},
		docSectionsIndex: []uint32{0, 0},
		newlinesIndex:    []uint32{0, 0},
		metaData: zoekt.IndexMetadata{
			PlainASCII: true,
		},
	}
	if err := corrupt.verify(); err == nil || !strings.Contains(err.Error(), "plain ASCII content 0 ends at rune 1, want 2") {
		t.Fatalf("verify plain ASCII corruption error = %v", err)
	}
}

func TestReadWriteNames(t *testing.T) {
b, err := NewShardBuilder(nil)
if err != nil {
Expand Down
Loading