Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
2c3c688
feat(config): add RetrievalConfig with rrf_k and fts top-k knobs
jorgenosberg Jun 11, 2026
016967d
feat(fts): add FtsStorage with chunks_fts schema and CRUD
jorgenosberg Jun 11, 2026
3448ab7
feat(fts): add symbol indexing and search to FtsStorage
jorgenosberg Jun 11, 2026
8f780e0
feat(retrieval): add FtsRetriever wrapper around FtsStorage
jorgenosberg Jun 11, 2026
c707f4c
feat(retrieval): add FusionRetriever with RRF over dense and lexical …
jorgenosberg Jun 12, 2026
b15b87e
feat(ingestion): write chunks and symbols to FtsStorage alongside Chroma
jorgenosberg Jun 12, 2026
97334d1
feat(query): use FusionRetriever as the base for context-aware retrieval
jorgenosberg Jun 12, 2026
1a3c6de
refactor(retrieval): replace HybridRetriever with FusionRetriever at …
jorgenosberg Jun 12, 2026
4dd4a25
refactor(retrieval): remove obsolete HybridRetriever and tidy generat…
jorgenosberg Jun 12, 2026
167a78d
feat(ingestion): clear FTS DB on force or legacy-state reindex
jorgenosberg Jun 12, 2026
630f925
test(retrieval): add fusion retriever to the evaluation harness
jorgenosberg Jun 12, 2026
2fe650a
fix(retrieval): sanitize FTS queries and normalize hit shape with Chroma
jorgenosberg Jun 15, 2026
4efe9ea
fix(retrieval): hydrate symbol-derived chunks with content and wire F…
jorgenosberg Jun 15, 2026
e424832
fix(retrieval): accept and ignore unknown filter keys in FtsRetriever
jorgenosberg Jun 15, 2026
7a1b8e6
fix(fts): allow FtsStorage connection across threads
jorgenosberg Jun 15, 2026
08cca5f
fix(fts): make add_symbols idempotent by deleting touched file_ids first
jorgenosberg Jun 15, 2026
3876ca5
perf(indexing): cache chunks-by-file lookup in CodebaseIndex
jorgenosberg Jun 15, 2026
c64ef35
fix(retrieval): drop incompatible score subtraction in retrieve_code_…
jorgenosberg Jun 15, 2026
99cb690
refactor(retrieval): require FusionRetriever in ContextAwareRetriever
jorgenosberg Jun 15, 2026
bce1913
chore: fix lint, format, and type errors
jorgenosberg Jun 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 23 additions & 7 deletions docstra/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,10 @@
from docstra.core.llm.local import LocalModelClient
from docstra.core.llm.ollama import OllamaClient
from docstra.core.llm.openai import OpenAIClient
from docstra.core.ingestion.fts_storage import FtsStorage
from docstra.core.retrieval.chroma import ChromaRetriever
from docstra.core.retrieval.hybrid import HybridRetriever
from docstra.core.retrieval.fts import FtsRetriever
from docstra.core.retrieval.fusion import FusionRetriever


class docstraant:
Expand Down Expand Up @@ -86,11 +88,16 @@ def setup_components(self):
]
)

# FTS storage (shared by indexer and retriever)
self.fts_storage = FtsStorage(f"{storage_dir}/index.db")
self.fts_retriever = FtsRetriever(self.fts_storage)

# Document indexer
self.document_indexer = DocumentIndexer(
self.storage,
self.embedding_generator,
codebase_root=str(Path.cwd()),
fts_storage=self.fts_storage,
)

# Code indexer
Expand All @@ -110,9 +117,14 @@ def setup_components(self):
codebase_root=str(Path.cwd()),
)

# Hybrid retriever
self.hybrid_retriever = HybridRetriever(
self.retriever, self.code_indexer.get_index()
# Fusion retriever
self.fusion_retriever = FusionRetriever(
dense=self.retriever,
fts=self.fts_retriever,
code_index=self.code_indexer.get_index(),
rrf_k=self.config.retrieval.rrf_k,
fts_chunks_top_k=self.config.retrieval.fts_chunks_top_k,
fts_symbols_top_k=self.config.retrieval.fts_symbols_top_k,
)

# LLM client
Expand Down Expand Up @@ -198,6 +210,12 @@ def index_file(self, filepath: str) -> str:
doc_id = self.document_indexer.index_document(document)
self.code_indexer.index_document(document)

# Write symbols to FTS for this file
manifest = self.code_indexer.get_manifest()
file_symbols = [s for s in manifest.symbols if s.file_id == doc_id]
if file_symbols:
self.fts_storage.add_symbols(file_symbols)

return doc_id

def document_code(
Expand Down Expand Up @@ -287,9 +305,7 @@ def answer_question(self, question: str, n_results: int = 5) -> str:
Generated answer
"""
# Retrieve relevant chunks
results = self.hybrid_retriever.retrieve(
query=question, n_results=n_results, use_code_context=True
)
results = self.fusion_retriever.retrieve(query=question, n_results=n_results)

# Generate answer
return self._require_text_response(
Expand Down
21 changes: 17 additions & 4 deletions docstra/core/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@
RetrievalEvalSummary,
evaluate_retrieval_cases,
)
from docstra.core.retrieval.hybrid import HybridRetriever
from docstra.core.ingestion.fts_storage import FtsStorage
from docstra.core.retrieval.fts import FtsRetriever
from docstra.core.retrieval.fusion import FusionRetriever
from docstra.core.services.initialization_service import InitializationService
from docstra.core.services.ingestion_service import IngestionService
from docstra.core.services.query_service import QueryService
Expand Down Expand Up @@ -1689,7 +1691,9 @@ def _get_persist_paths(
def _create_retrieval_eval_runner(
user_config: UserConfig, abs_codebase_path: Path
) -> Callable[[str, int], List[Dict[str, Any]]]:
_, chroma_path, index_path = _get_persist_paths(user_config, abs_codebase_path)
effective_persist_dir, chroma_path, index_path = _get_persist_paths(
user_config, abs_codebase_path
)
core_index_path = index_path / CORE_INDEX_FILENAME
chroma_check_file = chroma_path / "chroma.sqlite3"
legacy_index_artifacts = CodebaseIndex.legacy_artifacts_in(index_path)
Expand Down Expand Up @@ -1730,10 +1734,19 @@ def _create_retrieval_eval_runner(
code_index = code_indexer.get_index()

if code_index:
hybrid_retriever = HybridRetriever(base_retriever, code_index)
fts_storage = FtsStorage(str(effective_persist_dir / "index.db"))
fts_retriever = FtsRetriever(fts_storage)
fusion_retriever = FusionRetriever(
dense=base_retriever,
fts=fts_retriever,
code_index=code_index,
rrf_k=user_config.retrieval.rrf_k,
fts_chunks_top_k=user_config.retrieval.fts_chunks_top_k,
fts_symbols_top_k=user_config.retrieval.fts_symbols_top_k,
)

def retrieve(question: str, top_k: int) -> List[Dict[str, Any]]:
return hybrid_retriever.retrieve(question, n_results=top_k)
return fusion_retriever.retrieve(question, n_results=top_k)

return retrieve

Expand Down
27 changes: 27 additions & 0 deletions docstra/core/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,18 @@ def __init__(
self.exclude_patterns = exclude_patterns or []


class RetrievalConfig:
    """Tunable knobs for the fused (dense + full-text) retrieval path.

    The three values are stored as plain attributes; elsewhere in this
    change they are read back and forwarded to ``FusionRetriever`` under
    the same names, and are persisted under the ``retrieval`` key of the
    YAML config.

    Args:
        rrf_k: Value handed to ``FusionRetriever(rrf_k=...)``. Presumably
            the smoothing constant of reciprocal rank fusion -- confirm
            against the retriever implementation.
        fts_chunks_top_k: Value handed to
            ``FusionRetriever(fts_chunks_top_k=...)``.
        fts_symbols_top_k: Value handed to
            ``FusionRetriever(fts_symbols_top_k=...)``.
    """

    def __init__(
        self,
        rrf_k: int = 60,
        fts_chunks_top_k: int = 50,
        fts_symbols_top_k: int = 25,
    ) -> None:
        # No validation or coercion happens here: whatever the caller (or
        # the YAML loader) supplies is stored verbatim.
        self.fts_symbols_top_k = fts_symbols_top_k
        self.fts_chunks_top_k = fts_chunks_top_k
        self.rrf_k = rrf_k


class ConfigManager:
def __init__(self, config_path: Optional[str] = None) -> None:
self.config_path = config_path or "./.docstra/config.yaml"
Expand Down Expand Up @@ -180,6 +192,7 @@ def __init__(self) -> None:
self.processing = ProcessingConfig()
self.ingestion = IngestionConfig()
self.documentation = DocumentationConfig()
self.retrieval = RetrievalConfig()

def save_to_file(self, path: str) -> None:
"""Save configuration to YAML file."""
Expand Down Expand Up @@ -219,6 +232,11 @@ def save_to_file(self, path: str) -> None:
"exclude_patterns": self.ingestion.exclude_patterns,
},
"documentation": self.documentation.model_dump(),
"retrieval": {
"rrf_k": self.retrieval.rrf_k,
"fts_chunks_top_k": self.retrieval.fts_chunks_top_k,
"fts_symbols_top_k": self.retrieval.fts_symbols_top_k,
},
}

# Write to YAML file
Expand Down Expand Up @@ -283,3 +301,12 @@ def load_from_file(self, path: str) -> None:
self.processing.chunk_overlap = processing_data["chunk_overlap"]
if "exclude_patterns" in processing_data:
self.processing.exclude_patterns = processing_data["exclude_patterns"]

if "retrieval" in config_dict:
retrieval_data = config_dict["retrieval"]
if "rrf_k" in retrieval_data:
self.retrieval.rrf_k = retrieval_data["rrf_k"]
if "fts_chunks_top_k" in retrieval_data:
self.retrieval.fts_chunks_top_k = retrieval_data["fts_chunks_top_k"]
if "fts_symbols_top_k" in retrieval_data:
self.retrieval.fts_symbols_top_k = retrieval_data["fts_symbols_top_k"]
38 changes: 29 additions & 9 deletions docstra/core/documentation/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@
from docstra.core.document_processing.document import Document
from docstra.core.indexing.repo_map import RepositoryMap
from docstra.core.retrieval.chroma import ChromaRetriever
from docstra.core.retrieval.hybrid import HybridRetriever
from docstra.core.ingestion.fts_storage import FtsStorage
from docstra.core.retrieval.fts import FtsRetriever
from docstra.core.retrieval.fusion import FusionRetriever
from docstra.core.indexing.code_index import CodebaseIndex
from docstra.core.documentation.prompts import (
EnhancedDocumentationPrompts,
Expand Down Expand Up @@ -114,6 +116,8 @@ def __init__(
max_workers: Optional[int] = None,
documentation_depth: str = "comprehensive", # "overview", "standard", "comprehensive"
style_guide: Optional[str] = None,
persist_directory: Optional[Union[str, Path]] = None,
user_config: Optional[Any] = None,
):
"""Initialize the enhanced documentation generator.

Expand All @@ -129,6 +133,8 @@ def __init__(
max_workers: Maximum number of worker threads
documentation_depth: Level of documentation detail to generate
style_guide: Custom style guide for documentation
persist_directory: Persist directory root (needed to locate index.db for FTS)
user_config: UserConfig instance for retrieval settings
"""
self.llm_client = llm_client
self.output_dir = Path(output_dir)
Expand All @@ -145,12 +151,26 @@ def __init__(
# Enhanced progress reporting
self.progress_reporter = DocumentationProgressReporter(self.console)

# Set up hybrid retriever if available
self.hybrid_retriever = None
if self.chroma_retriever and self.code_index:
self.hybrid_retriever = HybridRetriever(
self.chroma_retriever, self.code_index
)
# Set up fusion retriever if chroma retriever, code index, and persist dir are available
self.fusion_retriever = None
if self.chroma_retriever and self.code_index and persist_directory:
fts_storage = FtsStorage(str(Path(persist_directory) / "index.db"))
fts_retriever = FtsRetriever(fts_storage)
if user_config and hasattr(user_config, "retrieval"):
self.fusion_retriever = FusionRetriever(
dense=self.chroma_retriever,
fts=fts_retriever,
code_index=self.code_index,
rrf_k=user_config.retrieval.rrf_k,
fts_chunks_top_k=user_config.retrieval.fts_chunks_top_k,
fts_symbols_top_k=user_config.retrieval.fts_symbols_top_k,
)
else:
self.fusion_retriever = FusionRetriever(
dense=self.chroma_retriever,
fts=fts_retriever,
code_index=self.code_index,
)

# Documentation state
self.processed_documents: Dict[str, Document] = {}
Expand Down Expand Up @@ -695,7 +715,7 @@ def _build_file_context(self, document: Document) -> str:
)

# Add cross-references
if self.hybrid_retriever:
if self.fusion_retriever:
cross_refs = self._get_file_cross_references(document)
if cross_refs:
context_parts.append(
Expand Down Expand Up @@ -754,7 +774,7 @@ def _get_similar_code_examples(self, document: Document) -> List[Dict[str, Any]]

def _get_file_cross_references(self, document: Document) -> List[str]:
"""Get cross-references for a file."""
if not self.hybrid_retriever or not self.chroma_retriever:
if not self.chroma_retriever:
return []

try:
Expand Down
24 changes: 23 additions & 1 deletion docstra/core/indexing/code_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from collections import defaultdict
import os
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, TypeVar, Union
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Union

from docstra.core.document_processing.document import Document, DocumentType
from docstra.core.indexing.model import (
Expand Down Expand Up @@ -55,6 +55,7 @@ def __init__(
self._functions_by_name: Dict[str, List[IndexedSymbol]] = defaultdict(list)
self._classes_by_name: Dict[str, List[IndexedSymbol]] = defaultdict(list)
self._symbols_by_file: Dict[str, List[IndexedSymbol]] = defaultdict(list)
self._chunks_by_file: Dict[str, List[Tuple[str, int, int]]] = defaultdict(list)
self._imports_by_source: Dict[str, List[ImportRecord]] = defaultdict(list)
self._imports_by_text: Dict[str, List[str]] = defaultdict(list)
self._dependencies_by_source: Dict[str, List[str]] = defaultdict(list)
Expand Down Expand Up @@ -110,6 +111,7 @@ def _rebuild_lookups(self) -> None:
self._functions_by_name = defaultdict(list)
self._classes_by_name = defaultdict(list)
self._symbols_by_file = defaultdict(list)
self._chunks_by_file = defaultdict(list)
self._imports_by_source = defaultdict(list)
self._imports_by_text = defaultdict(list)
self._dependencies_by_source = defaultdict(list)
Expand All @@ -123,6 +125,13 @@ def _rebuild_lookups(self) -> None:
elif symbol.kind == "class":
self._classes_by_name[symbol.name].append(symbol)

for chunk in self._manifest.chunks:
self._chunks_by_file[chunk.file_id].append(
(chunk.id, chunk.start_line, chunk.end_line)
)
for chunks in self._chunks_by_file.values():
chunks.sort(key=lambda item: item[1])

for import_record in self._manifest.imports:
self._imports_by_source[import_record.source_file_id].append(import_record)
self._imports_by_text[import_record.raw_text].append(
Expand Down Expand Up @@ -455,6 +464,15 @@ def get_related_files(self, filepath: str) -> List[str]:
related_files.discard(file_id)
return sorted(related_files)

def chunks_for_file(self, file_id: str) -> List[Tuple[str, int, int]]:
    """Return the ``(chunk_id, start_line, end_line)`` tuples for a file.

    The result is a fresh list, so callers may mutate it without touching
    the cached lookup. Entries come back in the order held by the lookup,
    which is sorted by start line whenever the lookups are rebuilt.

    Args:
        file_id: Identifier of the file whose chunks are wanted.

    Returns:
        A new list of chunk tuples, or an empty list when the file has no
        recorded chunks.
    """
    # Use .get() rather than indexing: the lookup is a defaultdict, and
    # subscripting would insert an empty entry for every unknown file id.
    cached = self._chunks_by_file.get(file_id)
    if cached is None:
        return []
    return list(cached)

def file_language(self, file_id: str) -> Optional[str]:
    """Look up the language stored on the file entry for ``file_id``.

    Args:
        file_id: Identifier of the file to look up.

    Returns:
        The entry's ``language`` attribute, or ``None`` when no (truthy)
        entry is registered under ``file_id``.
    """
    record = self._files_by_id.get(file_id)
    if not record:
        return None
    return record.language

def clear(self) -> None:
"""Clear the persisted manifest and in-memory lookups."""
self._manifest = CoreIndexManifest.empty(
Expand Down Expand Up @@ -552,3 +570,7 @@ def index_documents(self, documents: List[Document]) -> None:
def get_index(self) -> CodebaseIndex:
"""Get the underlying codebase index."""
return self.index

def get_manifest(self) -> CoreIndexManifest:
    """Expose the manifest held by the wrapped codebase index.

    Returns:
        The object found at ``self.index.manifest``; it is returned as-is,
        not copied.
    """
    current_manifest = self.index.manifest
    return current_manifest
Loading