Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions docstra/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

from collections.abc import Generator
from pathlib import Path

from docstra import __version__ as __version__

Expand Down Expand Up @@ -86,16 +87,28 @@ def setup_components(self):
)

# Document indexer
self.document_indexer = DocumentIndexer(self.storage, self.embedding_generator)
self.document_indexer = DocumentIndexer(
self.storage,
self.embedding_generator,
codebase_root=str(Path.cwd()),
)

# Code indexer
self.code_indexer = CodebaseIndexer(
index_directory=f"{storage_dir}/index",
exclude_patterns=self.config.processing.exclude_patterns,
codebase_root=str(Path.cwd()),
embedding_backend="chroma",
embedding_model=self.config.embedding.model_name,
source_kinds=["tree-sitter"],
)

# Retriever
self.retriever = ChromaRetriever(self.storage, self.embedding_generator)
self.retriever = ChromaRetriever(
self.storage,
self.embedding_generator,
codebase_root=str(Path.cwd()),
)

# Hybrid retriever
self.hybrid_retriever = HybridRetriever(
Expand Down
33 changes: 25 additions & 8 deletions docstra/core/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
)
from docstra.core.document_processing.extractor import DocumentProcessor
from docstra.core.documentation.generator import DocumentationGenerator
from docstra.core.indexing.code_index import CodebaseIndexer
from docstra.core.indexing.code_index import CodebaseIndex, CodebaseIndexer
from docstra.core.indexing.model import CORE_INDEX_FILENAME
from docstra.core.ingestion.embeddings import EmbeddingFactory
from docstra.core.ingestion.storage import ChromaDBStorage
from docstra.core.llm.anthropic import AnthropicClient
Expand Down Expand Up @@ -1689,16 +1690,25 @@ def _create_retrieval_eval_runner(
user_config: UserConfig, abs_codebase_path: Path
) -> Callable[[str, int], List[Dict[str, Any]]]:
_, chroma_path, index_path = _get_persist_paths(user_config, abs_codebase_path)
core_index_path = index_path / CORE_INDEX_FILENAME
chroma_check_file = chroma_path / "chroma.sqlite3"

if not index_path.exists() or not chroma_check_file.exists():
legacy_index_artifacts = CodebaseIndex.legacy_artifacts_in(index_path)
legacy_repo_map = index_path.parent / "repo_map.json"

if not core_index_path.exists() or not chroma_check_file.exists():
migration_hint = ""
if legacy_index_artifacts or legacy_repo_map.exists():
migration_hint = (
" Legacy index artifacts were found. Rerun 'docstra ingest' "
"to rebuild the index in the new format."
)
raise FileNotFoundError(
f"Codebase at {abs_codebase_path} is not fully initialized for "
f"retrieval evaluation. ChromaDB path: {chroma_path} "
f"(check file: {chroma_check_file}, exists: "
f"{chroma_check_file.exists()}), index path: {index_path} "
f"(exists: {index_path.exists()}). Run 'docstra init' and "
"'docstra ingest' first."
f"{chroma_check_file.exists()}), core index path: {core_index_path} "
f"(exists: {core_index_path.exists()}). Run 'docstra init' and "
f"'docstra ingest' first.{migration_hint}"
)

embedding_generator = EmbeddingFactory.create_embedding_generator(
Expand All @@ -1708,8 +1718,15 @@ def _create_retrieval_eval_runner(
api_base=user_config.model.api_base,
)
storage = ChromaDBStorage(persist_directory=str(chroma_path))
base_retriever = ChromaRetriever(storage, embedding_generator)
code_indexer = CodebaseIndexer(index_directory=str(index_path))
base_retriever = ChromaRetriever(
storage,
embedding_generator,
codebase_root=str(abs_codebase_path),
)
code_indexer = CodebaseIndexer(
index_directory=str(index_path),
codebase_root=str(abs_codebase_path),
)
code_index = code_indexer.get_index()

if code_index:
Expand Down
4 changes: 4 additions & 0 deletions docstra/core/document_processing/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ class DocumentType(str, Enum):
TEXT = "text"
OTHER = "other"

def __str__(self) -> str:
    """Return the enum value instead of the enum representation.

    With this override, ``str(member)`` and f-string interpolation of a
    member produce the member's ``value`` (for example ``"text"`` for the
    ``TEXT`` member) rather than the default enum string form.

    Returns:
        The ``value`` attribute of this member.
    """
    return self.value


class DocumentMetadata(BaseModel):
"""Metadata for a document."""
Expand Down
Loading