Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
GROQ_API_KEY=your_groq_api_key_here
46 changes: 46 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path

import yaml
from dotenv import load_dotenv

# Side effect at import time: populate os.environ from a local .env file
# (e.g. GROQ_API_KEY) so load_settings() can read it.
load_dotenv()


@dataclass
class Settings:
    """Runtime configuration for the RAG service.

    Built by load_settings() from config.yaml plus environment variables.
    """

    corpus_path: str       # folder scanned for *.pdf source documents
    chunk_size: int        # characters per text chunk
    chunk_overlap: int     # characters shared between consecutive chunks
    retrieval_k: int       # number of chunks retrieved per query
    model: str             # Groq chat model name used for generation
    embedding_model: str   # SentenceTransformer model used for embeddings
    chroma_path: str       # on-disk path of the persistent Chroma store
    collection_name: str   # name of the Chroma collection
    groq_api_key: str      # Groq API key; empty string disables generation


def load_settings(config_path: str = "config.yaml") -> Settings:
    """Build a Settings object from a YAML config file.

    Keys missing from the file fall back to portable defaults. The corpus
    location may additionally be overridden with the CORPUS_PATH environment
    variable; the Groq API key comes from GROQ_API_KEY (loaded via .env).

    Args:
        config_path: path to the YAML configuration file.

    Returns:
        A fully-populated Settings instance.

    Raises:
        FileNotFoundError: if *config_path* does not exist.
    """
    cfg_file = Path(config_path)
    if not cfg_file.exists():
        raise FileNotFoundError(f"Missing config file: {cfg_file}")

    with cfg_file.open("r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f) or {}

    # Precedence: explicit env override > config file > repo-relative default.
    # The default is intentionally relative ("./corpus") rather than an
    # absolute machine-specific path, so the service stays portable.
    corpus_path = os.getenv("CORPUS_PATH") or cfg.get("corpus_path", "./corpus")

    groq_api_key = os.getenv("GROQ_API_KEY", "")

    return Settings(
        corpus_path=corpus_path,
        chunk_size=int(cfg.get("chunk_size", 1000)),
        chunk_overlap=int(cfg.get("chunk_overlap", 150)),
        retrieval_k=int(cfg.get("retrieval_k", 4)),
        model=cfg.get("model", "llama-3.1-8b-instant"),
        embedding_model=cfg.get("embedding_model", "sentence-transformers/all-MiniLM-L6-v2"),
        chroma_path=cfg.get("chroma_path", ".chroma"),
        collection_name=cfg.get("collection_name", "intercom_pdfs"),
        groq_api_key=groq_api_key,
    )
31 changes: 31 additions & 0 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from __future__ import annotations

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

from app.config import load_settings
from app.rag import RAGEngine

# Settings and the RAG engine are constructed once at import time and shared
# by every request handler.
settings = load_settings()
rag = RAGEngine(settings)

app = FastAPI(title="Evalens Tracer-Bullet RAG")


class QueryRequest(BaseModel):
    """Request body for POST /query."""

    # Natural-language question to answer from the indexed PDF corpus.
    query: str


@app.on_event("startup")
def startup_event() -> None:
    """Index the PDF corpus once when the application starts.

    Ingestion failures (e.g. a missing corpus folder) are logged instead of
    propagating, so a bad corpus path no longer prevents the app from
    starting; /query can still serve whatever is already in the persistent
    index.

    NOTE(review): ``@app.on_event`` is deprecated in newer FastAPI versions
    in favor of the lifespan interface — worth migrating when the app
    factory is next touched.
    """
    import logging

    try:
        rag.ingest()
    except Exception:
        logging.getLogger(__name__).exception(
            "Corpus ingestion failed during startup; continuing without reindexing"
        )

@app.post("/query")
def query_api(payload: QueryRequest):
    """Answer a question with RAG over the indexed PDF corpus.

    Returns the engine's result dict (answer, retrieved chunks, sources).

    Raises:
        HTTPException 400: for invalid input (e.g. an empty query).
        HTTPException 500: for unexpected internal failures.
    """
    try:
        return rag.query(payload.query)
    except ValueError as e:
        # Caller error (e.g. empty query): safe to echo the reason.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # Log full details server-side, but return a generic message so
        # internal file paths / library error text are not leaked to clients.
        import logging

        logging.getLogger(__name__).exception("query failed")
        raise HTTPException(status_code=500, detail="Internal server error") from e
139 changes: 139 additions & 0 deletions app/rag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from __future__ import annotations

from pathlib import Path
from typing import Any

import chromadb
from chromadb.config import Settings as ChromaSettings
from chromadb.utils import embedding_functions
from groq import Groq
import fitz

from app.config import Settings


class RAGEngine:
    """Minimal retrieval-augmented generation engine over a folder of PDFs.

    PDFs are split into overlapping character chunks, embedded with a
    SentenceTransformer model, and stored in a persistent Chroma collection.
    Answers are generated with Groq chat completions grounded in the
    retrieved chunks; without an API key, retrieval still works.
    """

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        # Persistent on-disk store; reset is disabled so the index survives
        # process restarts.
        self._chroma = chromadb.PersistentClient(
            path=settings.chroma_path,
            settings=ChromaSettings(allow_reset=False),
        )
        self._embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=settings.embedding_model,
        )
        self._collection = self._chroma.get_or_create_collection(
            name=settings.collection_name,
            embedding_function=self._embedding_fn,
            metadata={"hnsw:space": "cosine"},
        )

        # Generation is optional: with no key we still serve retrieval-only
        # responses (see _generate_answer).
        self._groq = Groq(api_key=settings.groq_api_key) if settings.groq_api_key else None

    @staticmethod
    def _chunk_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
        """Split *text* into chunks of ``chunk_size`` characters overlapping
        by ``chunk_overlap`` characters.

        All whitespace runs are collapsed to single spaces first. Returns an
        empty list for empty/whitespace-only input.
        """
        text = " ".join(text.split())
        if not text:
            return []

        chunks: list[str] = []
        start = 0
        # Guard against chunk_overlap >= chunk_size, which would otherwise
        # make the window never advance.
        step = max(chunk_size - chunk_overlap, 1)
        while start < len(text):
            end = start + chunk_size
            chunks.append(text[start:end])
            start += step
        return chunks

    def ingest(self) -> dict[str, Any]:
        """Index every ``*.pdf`` in the corpus folder into the collection.

        Skips re-indexing if the collection already contains documents.

        Returns:
            A summary dict with ``indexed_files``/``indexed_chunks`` (and
            ``skipped`` when indexing occurred or was bypassed).

        Raises:
            FileNotFoundError: if the corpus folder does not exist.
        """
        corpus = Path(self.settings.corpus_path)
        if not corpus.exists():
            raise FileNotFoundError(f"Corpus folder not found: {corpus}")

        pdf_paths = sorted(corpus.glob("*.pdf"))
        if not pdf_paths:
            return {"indexed_files": 0, "indexed_chunks": 0}

        existing_count = self._collection.count()
        if existing_count > 0:
            # Assume the persisted index is current; re-ingestion would
            # require clearing the collection first.
            return {"indexed_files": 0, "indexed_chunks": existing_count, "skipped": True}

        total_chunks = 0
        # Add one PDF at a time so peak memory is bounded by the largest
        # document rather than the whole corpus.
        for pdf_path in pdf_paths:
            with fitz.open(pdf_path) as pdf_doc:
                full_text = "\n".join(page.get_text("text") for page in pdf_doc)
            chunks = self._chunk_text(
                full_text,
                self.settings.chunk_size,
                self.settings.chunk_overlap,
            )
            if not chunks:
                continue
            self._collection.add(
                ids=[f"{pdf_path.stem}-{idx}" for idx in range(len(chunks))],
                documents=chunks,
                metadatas=[
                    {"source": str(pdf_path), "chunk_index": idx}
                    for idx in range(len(chunks))
                ],
            )
            total_chunks += len(chunks)

        return {"indexed_files": len(pdf_paths), "indexed_chunks": total_chunks, "skipped": False}

    def query(self, user_query: str) -> dict[str, Any]:
        """Retrieve the top-k chunks for *user_query* and generate an answer.

        Returns a dict with the answer, retrieved chunks, de-duplicated
        source paths, and the retrieval parameters used.

        Raises:
            ValueError: if the query is empty or whitespace-only.
        """
        if not user_query.strip():
            raise ValueError("query cannot be empty")

        results = self._collection.query(
            query_texts=[user_query],
            n_results=self.settings.retrieval_k,
            # Distances are never used downstream, so don't request them.
            include=["documents", "metadatas"],
        )

        documents = results.get("documents", [[]])[0]
        metadatas = results.get("metadatas", [[]])[0]

        chunks = []
        sources = []
        for doc, meta in zip(documents, metadatas):
            source = (meta or {}).get("source", "")
            chunks.append(doc)
            # Preserve first-seen order while de-duplicating source paths.
            if source and source not in sources:
                sources.append(source)

        answer = self._generate_answer(user_query, chunks)

        return {
            "query": user_query,
            "answer": answer,
            "retrieved_chunks": chunks,
            "sources": sources,
            "model": self.settings.model,
            "retrieval_k": self.settings.retrieval_k,
            "chunk_size": self.settings.chunk_size,
        }

    def _generate_answer(self, query: str, chunks: list[str]) -> str:
        """Generate a grounded answer from *chunks*, or a fallback message
        when no context was retrieved or no Groq client is configured."""
        if not chunks:
            return "I couldn't find relevant context in the indexed PDF corpus."

        context = "\n\n".join(chunks[: self.settings.retrieval_k])

        if not self._groq:
            return "Set GROQ_API_KEY to enable generation. Retrieved context is available in retrieved_chunks."

        system = (
            "You answer questions using only the provided context. "
            "If context is insufficient, say so briefly."
        )
        user = f"Context:\n{context}\n\nQuestion: {query}"

        # temperature=0 keeps generation as deterministic as the API allows.
        completion = self._groq.chat.completions.create(
            model=self.settings.model,
            temperature=0,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
        )
        return completion.choices[0].message.content or ""
8 changes: 8 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
corpus_path: "./corpus"
chunk_size: 1000
chunk_overlap: 150
retrieval_k: 4
model: "llama-3.1-8b-instant"
embedding_model: "sentence-transformers/all-MiniLM-L6-v2"
chroma_path: ".chroma"
collection_name: "intercom_pdfs"
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
fastapi
uvicorn
pymupdf
chromadb
sentence-transformers
groq
python-dotenv
pyyaml