Skip to content

Commit 07cfd09

Browse files
authored
Merge pull request #953 from krissetto/cleanup-rag
Cleanup RAG code a bit
2 parents 8b6f62a + 8351ef6 commit 07cfd09

File tree

12 files changed

+1272
-1062
lines changed

12 files changed

+1272
-1062
lines changed

examples/rag.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,6 @@ rag:
2525
- ./rag/blork_field_guide.txt
2626
strategies:
2727
- type: chunked-embeddings
28-
model: openai/text-embedding-3-small
28+
embedding_model: openai/text-embedding-3-small
2929
database: ./rag/chunked_embeddings.db
3030
vector_dimensions: 1536

examples/rag/hybrid.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ rag:
2525
strategies:
2626
# Chunked embeddings strategy for semantic search
2727
- type: chunked-embeddings
28-
model: openai/text-embedding-3-small
28+
embedding_model: openai/text-embedding-3-small
2929
docs:
3030
- ./docs # Additional docs for chunked-embeddings strategy only
3131
database: ./chunked_embeddings.db # chunked-embeddings database to use; path to local sqlite database

examples/rag/reranking.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ rag:
3131

3232
strategies:
3333
- type: chunked-embeddings
34-
model: openai-embedder
34+
embedding_model: openai-embedder
3535
database: ./chunked_embeddings.db
3636
similarity_metric: cosine_similarity
3737
chunking:

pkg/config/latest/types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ type RAGStrategyConfig struct {
326326

327327
// Strategy-specific parameters (arbitrary key-value pairs)
328328
// Examples:
329-
// - chunked-embeddings: model, similarity_metric, threshold, vector_dimensions
329+
// - chunked-embeddings: embedding_model, similarity_metric, threshold, vector_dimensions
330330
// - bm25: k1, b, threshold
331331
Params map[string]any // Flattened into parent JSON
332332
}

pkg/rag/database/database.go

Lines changed: 13 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,37 @@
11
package database
22

33
import (
4-
"context"
54
"math"
65
)
76

8-
// Document represents a chunk of text with its embedding
7+
// Document represents a chunk of text - the base type returned by all RAG strategies.
8+
// Strategy-specific fields (embeddings, semantic summaries) are handled internally
9+
// by each strategy and don't need to be exposed here.
910
type Document struct {
10-
ID string `json:"id"`
11-
SourcePath string `json:"source_path"`
12-
ChunkIndex int `json:"chunk_index"`
13-
Content string `json:"content"`
14-
Embedding []float64 `json:"-"`
15-
FileHash string `json:"file_hash"`
16-
CreatedAt string `json:"created_at"`
11+
ID string `json:"id"`
12+
SourcePath string `json:"source_path"`
13+
ChunkIndex int `json:"chunk_index"`
14+
Content string `json:"content"`
15+
FileHash string `json:"file_hash"`
16+
CreatedAt string `json:"created_at"`
1717
}
1818

19-
// SearchResult represents a document with its similarity score
19+
// SearchResult represents a document with its relevance score.
20+
// This is the common return type for all Strategy.Query() implementations.
2021
type SearchResult struct {
2122
Document Document `json:"document"`
2223
Similarity float64 `json:"similarity"`
2324
}
2425

25-
// FileMetadata represents metadata about an indexed file
26+
// FileMetadata represents metadata about an indexed file.
27+
// Used for change detection and incremental indexing.
2628
type FileMetadata struct {
2729
SourcePath string
2830
FileHash string
2931
LastIndexed string
3032
ChunkCount int
3133
}
3234

33-
// Database interface for RAG operations
34-
// Implementations: SQLite (sqlite.go), PostgreSQL (future), Pinecone (future), etc.
35-
type Database interface {
36-
// Document operations
37-
AddDocument(ctx context.Context, doc Document) error
38-
DeleteDocumentsByPath(ctx context.Context, sourcePath string) error
39-
SearchSimilar(ctx context.Context, queryEmbedding []float64, limit int) ([]SearchResult, error)
40-
GetDocumentsByPath(ctx context.Context, sourcePath string) ([]Document, error)
41-
42-
// File metadata operations (for change detection and incremental indexing)
43-
GetFileMetadata(ctx context.Context, sourcePath string) (*FileMetadata, error)
44-
SetFileMetadata(ctx context.Context, metadata FileMetadata) error
45-
GetAllFileMetadata(ctx context.Context) ([]FileMetadata, error)
46-
DeleteFileMetadata(ctx context.Context, sourcePath string) error
47-
48-
// Resource management
49-
Close() error
50-
}
51-
5235
// Helper functions
5336

5437
// CosineSimilarity calculates cosine similarity between two vectors

pkg/rag/strategy/bm25.go

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,9 @@ func NewBM25FromConfig(ctx context.Context, cfg latest.RAGStrategyConfig, buildC
4444
return nil, fmt.Errorf("invalid database config: %w", err)
4545
}
4646

47-
// Create BM25-specific database (no vectors needed)
48-
db, err := NewBM25Database(dbPath)
47+
// Create BM25-specific database (no vectors needed).
48+
// Pass strategy type as table prefix so multiple strategies can share the same DB file.
49+
db, err := newBM25DB(dbPath, cfg.Type)
4950
if err != nil {
5051
return nil, fmt.Errorf("failed to create database: %w", err)
5152
}
@@ -66,14 +67,14 @@ func NewBM25FromConfig(ctx context.Context, cfg latest.RAGStrategyConfig, buildC
6667
"chunk_overlap", chunkOverlap,
6768
"respect_word_boundaries", respectWordBoundaries)
6869
if chunkSize == 0 {
69-
chunkSize = 1000
70+
chunkSize = 1500 // General text: good paragraph/section size
7071
}
7172
if chunkOverlap == 0 {
7273
chunkOverlap = 75
7374
}
7475

7576
// Create strategy
76-
strategy := NewBM25Strategy(
77+
strategy := newBM25Strategy(
7778
"bm25",
7879
db,
7980
events,
@@ -97,7 +98,7 @@ func NewBM25FromConfig(ctx context.Context, cfg latest.RAGStrategyConfig, buildC
9798
// BM25 is a ranking function that uses term frequency and inverse document frequency
9899
type BM25Strategy struct {
99100
name string
100-
db database.Database
101+
db *bm25DB
101102
processor *chunk.Processor
102103
fileHashes map[string]string
103104
watcher *fsnotify.Watcher
@@ -111,8 +112,8 @@ type BM25Strategy struct {
111112
docCount int // total number of documents
112113
}
113114

114-
// NewBM25Strategy creates a new BM25-based retrieval strategy
115-
func NewBM25Strategy(name string, db database.Database, events chan<- types.Event, k1, b float64) *BM25Strategy {
115+
// newBM25Strategy creates a new BM25-based retrieval strategy
116+
func newBM25Strategy(name string, db *bm25DB, events chan<- types.Event, k1, b float64) *BM25Strategy {
116117
return &BM25Strategy{
117118
name: name,
118119
db: db,
@@ -460,19 +461,10 @@ func (s *BM25Strategy) calculateBM25Score(queryTerms []string, doc database.Docu
460461
}
461462

462463
func (s *BM25Strategy) getAllDocuments(ctx context.Context) ([]database.Document, error) {
463-
// This is a placeholder - you'd need to add a method to the database interface
464-
// For now, we'll use SearchSimilar with an empty embedding to get all docs
465-
// In production, add a proper GetAllDocuments method to the database interface
466-
results, err := s.db.SearchSimilar(ctx, []float64{}, 10000)
464+
docs, err := s.db.GetAllDocuments(ctx)
467465
if err != nil {
468466
return nil, err
469467
}
470-
471-
docs := make([]database.Document, len(results))
472-
for i, result := range results {
473-
docs[i] = result.Document
474-
}
475-
476468
s.docCount = len(docs)
477469
return docs, nil
478470
}
@@ -550,13 +542,11 @@ func (s *BM25Strategy) indexFile(ctx context.Context, filePath string, chunkSize
550542
continue
551543
}
552544

553-
// For BM25, we don't need embeddings, but we still store the document
554545
doc := database.Document{
555546
ID: fmt.Sprintf("%s_%d_%d", filePath, chunk.Index, time.Now().UnixNano()),
556547
SourcePath: filePath,
557548
ChunkIndex: chunk.Index,
558549
Content: chunk.Content,
559-
Embedding: []float64{}, // Empty embedding for BM25
560550
FileHash: fileHash,
561551
}
562552

0 commit comments

Comments
 (0)