From 215155633538a8383786868ff06f2a26582a24fe Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Mon, 8 Jun 2026 11:30:13 -0400 Subject: [PATCH 1/5] refactor(eval): restructure harness to Code Health 10.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit eval/harness.py carried pre-existing technical debt (CodeScene Code Health 7.52): three ~100-line evaluator functions with cyclomatic complexity 14-20, a bumpy-road nesting smell, and 5-6 argument signatures. Refactor (behaviour-preserving — the asserted metric values in test_eval_harness.py are unchanged): - Introduce EvalContext bundling the shared deps (settings, embedder, lazy llm, corpus/label dirs); every evaluator and helper now takes (session, ctx). This is the 'missing abstraction' CodeScene flagged for the argument-count smell. - Extract each scoring loop into a focused tally (_ExtractionTally / _RetrievalTally / _RagTally) with score/record + to_result methods, plus shared _ingest_relevant_ sources / _resolve_relevant_ids helpers and a _citation_validity_contribution helper. - Flatten _collect_filenames to comprehensions. Result: every evaluator drops to cc 3-5 / ~25 lines / 2 args; module mean cc 6.25 -> 3.13; Code Health 7.52 -> 10.0. make check green (222 backend pytest, 7 frontend, ruff/mypy clean). --- backend/tests/test_eval_harness.py | 33 +- eval/harness.py | 632 ++++++++++++++--------------- eval/run.py | 4 +- 3 files changed, 331 insertions(+), 338 deletions(-) diff --git a/backend/tests/test_eval_harness.py b/backend/tests/test_eval_harness.py index 5e9e97a..f714494 100644 --- a/backend/tests/test_eval_harness.py +++ b/backend/tests/test_eval_harness.py @@ -38,6 +38,7 @@ from backend.app.llm import LLMResponse from backend.app.models import SCHEMA_EMBEDDING_DIM from eval.harness import ( + EvalContext, HarnessReport, RagResult, RetrievalResult, @@ -171,14 +172,14 @@ def test_evaluate_extraction_emits_na_under_fake_llm(session: Session, tmp_path: ] }, ) - result = evaluate_extraction( - session, + ctx = EvalContext( settings=Settings.model_validate({"llm_provider": "fake", "embeddings_provider": "fake"}), llm=ScriptedFakeLLM(), embedder=FakeEmbedder(), corpus_dir=corpus, labels_dir=labels, ) + result = evaluate_extraction(session, ctx) assert result.quotable is False assert result.micro_accuracy is None assert result.macro_accuracy is None @@ -201,13 +202,13 @@ def test_evaluate_retrieval_emits_na_under_fake_embedder(session: Session, tmp_p ], }, ) - result = evaluate_retrieval( - session, + ctx = EvalContext( settings=Settings.model_validate({"embeddings_provider": "fake"}), embedder=FakeEmbedder(), corpus_dir=corpus, labels_dir=labels, ) + result = evaluate_retrieval(session, ctx) assert result.quotable is False assert result.precision_at_k is None assert result.recall_at_k is None @@ -234,14 +235,14 @@ def test_evaluate_rag_emits_na_under_either_fake(session: Session, tmp_path: Pat Settings.model_validate({"llm_provider": "fake", "embeddings_provider": "openai"}), Settings.model_validate({"llm_provider": "anthropic", "embeddings_provider": "fake"}), ): - result = evaluate_rag( - session, + ctx = EvalContext( settings=settings, llm=ScriptedFakeLLM(), embedder=FakeEmbedder(), corpus_dir=corpus, labels_dir=labels, ) + result = evaluate_rag(session, ctx) assert result.quotable is False assert result.citation_validity_rate is None assert result.cites_relevant_rate is None @@ -307,14 +308,14 @@ def test_evaluate_extraction_perfect_run_yields_unit_accuracy( ) llm = ScriptedFakeLLM(response_for_chunk=_perfect_invoice_json_for) - result = evaluate_extraction( - session, + ctx = EvalContext( settings=_real_provider_settings(), llm=llm, embedder=FakeEmbedder(), corpus_dir=corpus, labels_dir=labels, ) + result = evaluate_extraction(session, ctx) assert result.quotable is True assert result.failed_extractions == 0 assert result.micro_accuracy == pytest.approx(1.0) @@ -345,14 +346,14 @@ def test_evaluate_extraction_one_wrong_field_pins_micro_macro( ) llm = ScriptedFakeLLM(response_for_chunk=_wrong_vendor_invoice_json_for) - result = evaluate_extraction( - session, + ctx = EvalContext( settings=_real_provider_settings(), llm=llm, embedder=FakeEmbedder(), corpus_dir=corpus, labels_dir=labels, ) + result = evaluate_extraction(session, ctx) assert result.quotable is True assert result.failed_extractions == 0 # 3 of 4 fields correct → micro = 0.75. Macro = mean of (1, 0, 1, 1) = 0.75. @@ -410,13 +411,13 @@ def test_evaluate_retrieval_pins_precision_recall_mrr(session: Session, tmp_path }, ) - result = evaluate_retrieval( - session, + ctx = EvalContext( settings=_real_provider_settings(retrieval_top_k=2), embedder=embedder, corpus_dir=corpus, labels_dir=labels, ) + result = evaluate_retrieval(session, ctx) assert result.quotable is True assert result.k == 2 assert result.precision_at_k == pytest.approx(0.5) @@ -454,14 +455,14 @@ def respond(cid: int) -> str: }, ) - result = evaluate_rag( - session, + ctx = EvalContext( settings=_real_provider_settings(retrieval_top_k=3), llm=llm, embedder=embedder, corpus_dir=corpus, labels_dir=labels, ) + result = evaluate_rag(session, ctx) assert result.quotable is True assert result.refusals == 0 assert result.answered == 1 @@ -502,14 +503,14 @@ def test_render_writes_real_metrics_when_quotable(session: Session, tmp_path: Pa ) llm = ScriptedFakeLLM(response_for_chunk=_perfect_invoice_json_for) - extraction = evaluate_extraction( - session, + ctx = EvalContext( settings=_real_provider_settings(), llm=llm, embedder=FakeEmbedder(), corpus_dir=corpus, labels_dir=labels, ) + extraction = evaluate_extraction(session, ctx) report = HarnessReport( extraction=extraction, diff --git a/eval/harness.py b/eval/harness.py index 453be3a..5bf523d 100644 --- a/eval/harness.py +++ b/eval/harness.py @@ -4,6 +4,12 @@ that stitches their results into a single report. Each evaluator returns a typed, frozen dataclass that the writer in :mod:`eval.results` can render. +Every evaluator takes a session and an :class:`EvalContext` (the shared +configuration, embedder, optional LLM, and corpus/label directories). The +per-item scoring lives in small mutable tally objects whose ``to_result`` method +emits the frozen result dataclass — this keeps each evaluator short and its +control flow shallow. + Honesty discipline (per CLAUDE.md Golden Rule #5 + the M9 design lock-in): * If ``settings.embeddings_provider == "fake"`` the retrieval and citation-mapping @@ -92,30 +98,63 @@ class HarnessReport: settings_summary: dict[str, Any] -# --- helpers ---------------------------------------------------------------------- +# --- evaluation context ------------------------------------------------------------ + + +@dataclass(frozen=True, slots=True) +class EvalContext: + """The shared dependencies every evaluator needs. + + Bundling configuration, the embedder, the (optional) LLM, and the + corpus/label directories into one value keeps each evaluator's signature to + ``(session, ctx)`` and puts the dependency wiring in a single place. + """ + + settings: Settings + embedder: EmbeddingProvider + llm: LLMClient | None = None + corpus_dir: Path = DEFAULT_CORPUS_DIR + labels_dir: Path = LABELS_DIR + + @classmethod + def create(cls, settings: Settings | None = None) -> EvalContext: + """Build a context for a production run, resolving ``settings`` and the + ``embedder`` from the environment. The LLM is left lazy (see + :meth:`require_llm`) so a retrieval-only run never forces an LLM provider to + resolve — matching the original per-evaluator resolution behaviour. To inject + fakes or point at a custom corpus/label dir (as the tests do), construct + :class:`EvalContext` directly.""" + settings = settings or get_settings() + return cls(settings=settings, embedder=get_embedder(settings)) + + def require_llm(self) -> LLMClient: + """Return the LLM, resolving it from settings on first need. Evaluators that + never call this (e.g. retrieval) never force an LLM provider to resolve.""" + return self.llm if self.llm is not None else get_llm(self.settings) + + +# --- corpus / label helpers -------------------------------------------------------- def _read_corpus_file(corpus_dir: Path, source_filename: str) -> str: - path = corpus_dir / source_filename - return path.read_text(encoding="utf-8") + return (corpus_dir / source_filename).read_text(encoding="utf-8") -def _ensure_ingested( - session: Session, - *, - corpus_dir: Path, - source_filename: str, - embedder: EmbeddingProvider, - settings: Settings, -) -> int | None: - """Ensure the labelled corpus file is ingested. Returns the document id or None - if the file is missing on disk.""" - path = corpus_dir / source_filename +def _load_labels(name: str, labels_dir: Path) -> dict[str, Any]: + data = json.loads((labels_dir / name).read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ValueError(f"label file {name!r} must contain a JSON object at top level") + return data + + +def _ensure_ingested(session: Session, ctx: EvalContext, source_filename: str) -> int | None: + """Ensure the labelled corpus file is ingested. Returns the document id, or + ``None`` if the file is missing on disk.""" + path = ctx.corpus_dir / source_filename if not path.exists(): return None text = path.read_text(encoding="utf-8") - content_hash = canonical_hash(text) - existing = documents_repo.get_by_hash(session, content_hash) + existing = documents_repo.get_by_hash(session, canonical_hash(text)) if existing is not None: return existing.id result = ingest_document( @@ -124,54 +163,128 @@ def _ensure_ingested( source=str(path.resolve()), title=path.stem, mime_type="text/markdown", - embedder=embedder, - settings=settings, + embedder=ctx.embedder, + settings=ctx.settings, ) return result.document_id +def _ingest_relevant_sources( + session: Session, ctx: EvalContext, entries: list[dict[str, Any]] +) -> None: + """Ingest every corpus file referenced by any entry's ``relevant`` list.""" + referenced = {rel["source_filename"] for e in entries for rel in e.get("relevant", [])} + for source in referenced: + _ensure_ingested(session, ctx, source) + + def _resolve_chunk_id( - session: Session, *, corpus_dir: Path, source_filename: str, chunk_ord: int + session: Session, ctx: EvalContext, source_filename: str, chunk_ord: int ) -> int | None: - text = _read_corpus_file(corpus_dir, source_filename) + text = _read_corpus_file(ctx.corpus_dir, source_filename) doc = documents_repo.get_by_hash(session, canonical_hash(text)) if doc is None: return None - chunks = chunks_repo.list_for_document(session, doc.id) - for chunk in chunks: + for chunk in chunks_repo.list_for_document(session, doc.id): if chunk.ord == chunk_ord: return chunk.id return None -def _load_labels(name: str, labels_dir: Path) -> dict[str, Any]: - data = json.loads((labels_dir / name).read_text(encoding="utf-8")) - if not isinstance(data, dict): - raise ValueError(f"label file {name!r} must contain a JSON object at top level") - return data +def _resolve_relevant_ids(session: Session, ctx: EvalContext, entry: dict[str, Any]) -> set[int]: + """Resolve the chunk ids of an entry's labelled-relevant references that exist.""" + ids: set[int] = set() + for ref in entry.get("relevant", []): + cid = _resolve_chunk_id(session, ctx, ref["source_filename"], ref["chunk_ord"]) + if cid is not None: + ids.add(cid) + return ids # --- extraction -------------------------------------------------------------------- -def evaluate_extraction( - session: Session, - *, - settings: Settings | None = None, - llm: LLMClient | None = None, - embedder: EmbeddingProvider | None = None, - corpus_dir: Path = DEFAULT_CORPUS_DIR, - labels_dir: Path = LABELS_DIR, -) -> ExtractionResult: - settings = settings or get_settings() - embedder = embedder or get_embedder(settings) - llm = llm or get_llm(settings) - - labels = _load_labels("extraction_labels.json", labels_dir) +@dataclass +class _ExtractionTally: + """Mutable accumulator for per-document extraction scoring.""" + + field_correct: dict[str, int] = field(default_factory=dict) + field_total: dict[str, int] = field(default_factory=dict) + present_in_extraction: dict[str, int] = field(default_factory=dict) + present_in_truth: dict[str, int] = field(default_factory=dict) + correct_documents: int = 0 + total_documents: int = 0 + + def score_document(self, expected: dict[str, Any], actual: dict[str, Any]) -> None: + """Fold one successful extraction's per-field correctness into the tally.""" + self.total_documents += 1 + all_match = True + for fname, expected_value in expected.items(): + self.field_total[fname] = self.field_total.get(fname, 0) + 1 + if expected_value is not None: + self.present_in_truth[fname] = self.present_in_truth.get(fname, 0) + 1 + actual_value = actual.get(fname) + if actual_value is not None: + self.present_in_extraction[fname] = self.present_in_extraction.get(fname, 0) + 1 + if values_equal(expected_value, actual_value): + self.field_correct[fname] = self.field_correct.get(fname, 0) + 1 + else: + all_match = False + if all_match: + self.correct_documents += 1 + + def to_result(self, *, n_documents: int, failed: int) -> ExtractionResult: + """Emit the frozen result: micro/macro accuracy and per-field accuracy + P/R.""" + if self.total_documents == 0: + return ExtractionResult( + n_documents=n_documents, + quotable=False, + failed_extractions=failed, + note="No extractions succeeded; cannot compute accuracy.", + ) + micro_total = sum(self.field_total.values()) + micro_correct = sum(self.field_correct.values()) + micro_accuracy = micro_correct / micro_total if micro_total else 0.0 + + per_field_accuracy = { + name: self.field_correct.get(name, 0) / total + for name, total in self.field_total.items() + if total > 0 + } + macro_accuracy = ( + sum(per_field_accuracy.values()) / len(per_field_accuracy) + if per_field_accuracy + else 0.0 + ) + + per_field_pr: dict[str, dict[str, float]] = {} + for name in self.field_total: + correct = self.field_correct.get(name, 0) + ex_present = self.present_in_extraction.get(name, 0) + truth_present = self.present_in_truth.get(name, 0) + per_field_pr[name] = { + "precision": (correct / ex_present) if ex_present else 0.0, + "recall": (correct / truth_present) if truth_present else 0.0, + } + + return ExtractionResult( + n_documents=n_documents, + quotable=True, + micro_accuracy=micro_accuracy, + macro_accuracy=macro_accuracy, + per_field_accuracy=per_field_accuracy, + per_field_precision_recall=per_field_pr, + failed_extractions=failed, + ) + + +def evaluate_extraction(session: Session, ctx: EvalContext) -> ExtractionResult: + """Score schema extraction against the labelled corpus.""" + labels = _load_labels("extraction_labels.json", ctx.labels_dir) items: list[dict[str, Any]] = labels.get("items", []) n_documents = len(items) - if settings.llm_provider == "fake": + if ctx.settings.llm_provider == "fake": return ExtractionResult( n_documents=n_documents, quotable=False, @@ -181,23 +294,11 @@ def evaluate_extraction( ), ) - field_correct: dict[str, int] = {} - field_total: dict[str, int] = {} - field_present_in_extraction: dict[str, int] = {} - field_present_in_truth: dict[str, int] = {} - correct_documents = 0 - total_documents = 0 + llm = ctx.require_llm() + tally = _ExtractionTally() failed = 0 - for item in items: - source = item["source_filename"] - doc_id = _ensure_ingested( - session, - corpus_dir=corpus_dir, - source_filename=source, - embedder=embedder, - settings=settings, - ) + doc_id = _ensure_ingested(session, ctx, item["source_filename"]) if doc_id is None: failed += 1 continue @@ -206,90 +307,64 @@ def evaluate_extraction( document_id=doc_id, schema_name=item["schema_name"], llm=llm, - settings=settings, + settings=ctx.settings, ) if result.status != "ok": failed += 1 continue - total_documents += 1 - expected = item["expected"] - actual = result.payload - all_match = True - for fname, expected_value in expected.items(): - field_total[fname] = field_total.get(fname, 0) + 1 - if expected_value is not None: - field_present_in_truth[fname] = field_present_in_truth.get(fname, 0) + 1 - actual_value = actual.get(fname) - if actual_value is not None: - field_present_in_extraction[fname] = field_present_in_extraction.get(fname, 0) + 1 - if values_equal(expected_value, actual_value): - field_correct[fname] = field_correct.get(fname, 0) + 1 - else: - all_match = False - if all_match: - correct_documents += 1 - - if total_documents == 0: - return ExtractionResult( - n_documents=n_documents, - quotable=False, - failed_extractions=failed, - note="No extractions succeeded; cannot compute accuracy.", - ) - - micro_total = sum(field_total.values()) - micro_correct = sum(field_correct.values()) - micro_accuracy = micro_correct / micro_total if micro_total else 0.0 - - per_field_accuracy = { - name: (field_correct.get(name, 0) / total) - for name, total in field_total.items() - if total > 0 - } - macro_accuracy = ( - sum(per_field_accuracy.values()) / len(per_field_accuracy) if per_field_accuracy else 0.0 - ) + tally.score_document(item["expected"], result.payload) - per_field_pr: dict[str, dict[str, float]] = {} - for name in field_total: - truth_present = field_present_in_truth.get(name, 0) - ex_present = field_present_in_extraction.get(name, 0) - correct = field_correct.get(name, 0) - precision = (correct / ex_present) if ex_present else 0.0 - recall = (correct / truth_present) if truth_present else 0.0 - per_field_pr[name] = {"precision": precision, "recall": recall} - - return ExtractionResult( - n_documents=n_documents, - quotable=True, - micro_accuracy=micro_accuracy, - macro_accuracy=macro_accuracy, - per_field_accuracy=per_field_accuracy, - per_field_precision_recall=per_field_pr, - failed_extractions=failed, - ) + return tally.to_result(n_documents=n_documents, failed=failed) # --- retrieval --------------------------------------------------------------------- -def evaluate_retrieval( - session: Session, - *, - settings: Settings | None = None, - embedder: EmbeddingProvider | None = None, - corpus_dir: Path = DEFAULT_CORPUS_DIR, - labels_dir: Path = LABELS_DIR, -) -> RetrievalResult: - settings = settings or get_settings() - embedder = embedder or get_embedder(settings) +@dataclass +class _RetrievalTally: + """Mutable accumulator for per-query retrieval scoring.""" + + precision_sum: float = 0.0 + recall_sum: float = 0.0 + rr_sum: float = 0.0 + valid_queries: int = 0 + + def record(self, relevant_ids: set[int], retrieved_ids: list[int], k: int) -> None: + """Fold one query's precision@k, recall@k, and reciprocal rank into the tally.""" + self.valid_queries += 1 + overlap = len(relevant_ids & set(retrieved_ids)) + self.precision_sum += overlap / k + self.recall_sum += overlap / len(relevant_ids) + rank = next((i + 1 for i, cid in enumerate(retrieved_ids) if cid in relevant_ids), 0) + self.rr_sum += (1.0 / rank) if rank > 0 else 0.0 + + def to_result(self, *, n_queries: int, k: int) -> RetrievalResult: + """Emit the frozen result: mean precision@k, recall@k, and MRR.""" + if self.valid_queries == 0: + return RetrievalResult( + n_queries=n_queries, + k=k, + quotable=False, + note="No queries had resolvable relevant chunks.", + ) + return RetrievalResult( + n_queries=self.valid_queries, + k=k, + quotable=True, + precision_at_k=self.precision_sum / self.valid_queries, + recall_at_k=self.recall_sum / self.valid_queries, + mrr=self.rr_sum / self.valid_queries, + ) + - labels = _load_labels("retrieval_labels.json", labels_dir) +def evaluate_retrieval(session: Session, ctx: EvalContext) -> RetrievalResult: + """Score pgvector retrieval (precision@k / recall@k / MRR) against the labels.""" + labels = _load_labels("retrieval_labels.json", ctx.labels_dir) queries: list[dict[str, Any]] = labels.get("queries", []) - k = int(labels.get("k", settings.retrieval_top_k)) + k = int(labels.get("k", ctx.settings.retrieval_top_k)) n_queries = len(queries) - if settings.embeddings_provider == "fake": + if ctx.settings.embeddings_provider == "fake": return RetrievalResult( n_queries=n_queries, k=k, @@ -303,95 +378,107 @@ def evaluate_retrieval( # Lazy import here so the harness module imports cleanly when retrieval isn't run. from backend.app.retrieval import cosine_top_k - # Ensure every relevance-judged file is ingested. - referenced: set[str] = { - rel["source_filename"] for q in queries for rel in q.get("relevant", []) - } - for source in referenced: - _ensure_ingested( - session, - corpus_dir=corpus_dir, - source_filename=source, - embedder=embedder, - settings=settings, - ) - - precision_sum = 0.0 - recall_sum = 0.0 - rr_sum = 0.0 - valid_queries = 0 + _ingest_relevant_sources(session, ctx, queries) + tally = _RetrievalTally() for entry in queries: - relevant_ids: set[int] = set() - for ref in entry.get("relevant", []): - cid = _resolve_chunk_id( - session, - corpus_dir=corpus_dir, - source_filename=ref["source_filename"], - chunk_ord=ref["chunk_ord"], - ) - if cid is not None: - relevant_ids.add(cid) + relevant_ids = _resolve_relevant_ids(session, ctx, entry) if not relevant_ids: continue - valid_queries += 1 - [query_vec] = embedder.embed([entry["query"]]) + [query_vec] = ctx.embedder.embed([entry["query"]]) hits = cosine_top_k(session, query_vec=query_vec, k=k) - retrieved_ids = [h.chunk.id for h in hits] - retrieved_set = set(retrieved_ids) + tally.record(relevant_ids, [h.chunk.id for h in hits], k) - precision_sum += len(relevant_ids & retrieved_set) / k - recall_sum += len(relevant_ids & retrieved_set) / len(relevant_ids) + return tally.to_result(n_queries=n_queries, k=k) - rank = next( - (i + 1 for i, cid in enumerate(retrieved_ids) if cid in relevant_ids), - 0, - ) - rr_sum += (1.0 / rank) if rank > 0 else 0.0 - if valid_queries == 0: - return RetrievalResult( - n_queries=n_queries, - k=k, - quotable=False, - note="No queries had resolvable relevant chunks.", - ) - - return RetrievalResult( - n_queries=valid_queries, - k=k, - quotable=True, - precision_at_k=precision_sum / valid_queries, - recall_at_k=recall_sum / valid_queries, - mrr=rr_sum / valid_queries, - ) - - -# --- rag -------------------------------------------------------------------------- +# --- rag --------------------------------------------------------------------------- def _parse_cited_chunk_ids(text: str) -> set[int]: return {int(m.group(1)) for m in CITATION_PATTERN.finditer(text)} -def evaluate_rag( - session: Session, - *, - settings: Settings | None = None, - llm: LLMClient | None = None, - embedder: EmbeddingProvider | None = None, - corpus_dir: Path = DEFAULT_CORPUS_DIR, - labels_dir: Path = LABELS_DIR, -) -> RagResult: - settings = settings or get_settings() - embedder = embedder or get_embedder(settings) - llm = llm or get_llm(settings) - - labels = _load_labels("rag_labels.json", labels_dir) +def _citation_validity_contribution( + cited_ids: set[int], retrieved_ids: set[int] +) -> tuple[int, int]: + """Return ``(hits, total)`` for one answer's citation-validity score. + + Every cited id should be in the retrieved set. An answer with no citations + counts as 0/1 to penalise it — this mirrors the M3 citation-or-refuse posture + while reporting a rate rather than refusing.""" + total = len(cited_ids) if cited_ids else 1 + if not cited_ids: + return 0, total + if cited_ids.issubset(retrieved_ids): + return len(cited_ids), total + return len(cited_ids & retrieved_ids), total + + +@dataclass +class _RagTally: + """Mutable accumulator for per-question RAG scoring.""" + + refusals: int = 0 + answered: int = 0 + citation_validity_hits: int = 0 + citation_validity_total: int = 0 + cites_relevant_hits: int = 0 + answer_substring_hits: int = 0 + + def record_answer( + self, + *, + answer: str, + retrieved_ids: set[int], + relevant_ids: set[int], + expected_substring: str | None, + ) -> None: + """Fold one answered question into the citation-validity, cites-relevant, + and substring-match arms.""" + self.answered += 1 + cited_ids = _parse_cited_chunk_ids(answer) + hits, total = _citation_validity_contribution(cited_ids, retrieved_ids) + self.citation_validity_hits += hits + self.citation_validity_total += total + if cited_ids & relevant_ids: + self.cites_relevant_hits += 1 + if expected_substring and expected_substring.casefold() in answer.casefold(): + self.answer_substring_hits += 1 + + def to_result(self, *, n_questions: int) -> RagResult: + """Emit the frozen result: citation-validity, cites-relevant, substring rates.""" + if self.answered == 0: + return RagResult( + n_questions=n_questions, + refusals=self.refusals, + answered=0, + quotable=False, + note="All questions were refused; cannot compute answer-side metrics.", + ) + citation_validity_rate = ( + self.citation_validity_hits / self.citation_validity_total + if self.citation_validity_total + else 0.0 + ) + return RagResult( + n_questions=n_questions, + refusals=self.refusals, + answered=self.answered, + quotable=True, + citation_validity_rate=citation_validity_rate, + cites_relevant_rate=self.cites_relevant_hits / self.answered, + answer_substring_match_rate=self.answer_substring_hits / self.answered, + ) + + +def evaluate_rag(session: Session, ctx: EvalContext) -> RagResult: + """Score citation-grounded RAG answers against the labelled questions.""" + labels = _load_labels("rag_labels.json", ctx.labels_dir) questions: list[dict[str, Any]] = labels.get("questions", []) n_questions = len(questions) - if settings.embeddings_provider == "fake" or settings.llm_provider == "fake": + if ctx.settings.embeddings_provider == "fake" or ctx.settings.llm_provider == "fake": return RagResult( n_questions=n_questions, refusals=0, @@ -403,90 +490,29 @@ def evaluate_rag( ), ) - referenced: set[str] = { - rel["source_filename"] for q in questions for rel in q.get("relevant", []) - } - for source in referenced: - _ensure_ingested( - session, - corpus_dir=corpus_dir, - source_filename=source, - embedder=embedder, - settings=settings, - ) - - refusals = 0 - answered = 0 - citation_validity_hits = 0 - citation_validity_total = 0 - cites_relevant_hits = 0 - answer_substring_hits = 0 + llm = ctx.require_llm() + _ingest_relevant_sources(session, ctx, questions) + tally = _RagTally() for entry in questions: result = answer_query( session, query=entry["question"], - embedder=embedder, + embedder=ctx.embedder, llm=llm, - settings=settings, + settings=ctx.settings, ) if result.status == "refused": - refusals += 1 + tally.refusals += 1 continue - answered += 1 - - retrieved_ids = {h.chunk.id for h in result.retrieved} - cited_ids = _parse_cited_chunk_ids(result.answer) - citation_validity_total += len(cited_ids) if cited_ids else 1 - # Every cited id should be in the retrieved set; if no citations exist we - # count as 0/1 to penalise a citation-less answer (this matches the M3 - # citation-or-refuse posture but reports the rate rather than refusing). - if cited_ids and cited_ids.issubset(retrieved_ids): - citation_validity_hits += len(cited_ids) - elif cited_ids: - citation_validity_hits += len(cited_ids & retrieved_ids) - - relevant_ids = { - _resolve_chunk_id( - session, - corpus_dir=corpus_dir, - source_filename=ref["source_filename"], - chunk_ord=ref["chunk_ord"], - ) - for ref in entry.get("relevant", []) - } - relevant_ids.discard(None) - if cited_ids & relevant_ids: - cites_relevant_hits += 1 - - expected_substring = entry.get("expected_answer_substring") - if expected_substring and expected_substring.casefold() in result.answer.casefold(): - answer_substring_hits += 1 - - if answered == 0: - return RagResult( - n_questions=n_questions, - refusals=refusals, - answered=0, - quotable=False, - note="All questions were refused; cannot compute answer-side metrics.", + tally.record_answer( + answer=result.answer, + retrieved_ids={h.chunk.id for h in result.retrieved}, + relevant_ids=_resolve_relevant_ids(session, ctx, entry), + expected_substring=entry.get("expected_answer_substring"), ) - citation_validity_rate = ( - citation_validity_hits / citation_validity_total if citation_validity_total else 0.0 - ) - cites_relevant_rate = cites_relevant_hits / answered - answer_substring_match_rate = answer_substring_hits / answered - - return RagResult( - n_questions=n_questions, - refusals=refusals, - answered=answered, - quotable=True, - citation_validity_rate=citation_validity_rate, - cites_relevant_rate=cites_relevant_rate, - answer_substring_match_rate=answer_substring_match_rate, - ) + return tally.to_result(n_questions=n_questions) # --- orchestrator ------------------------------------------------------------------ @@ -507,46 +533,13 @@ def _settings_summary(settings: Settings) -> dict[str, Any]: } -def run_all( - session: Session, - *, - settings: Settings | None = None, - llm: LLMClient | None = None, - embedder: EmbeddingProvider | None = None, - corpus_dir: Path = DEFAULT_CORPUS_DIR, - labels_dir: Path = LABELS_DIR, -) -> HarnessReport: - settings = settings or get_settings() - - extraction = evaluate_extraction( - session, - settings=settings, - llm=llm, - embedder=embedder, - corpus_dir=corpus_dir, - labels_dir=labels_dir, - ) - retrieval = evaluate_retrieval( - session, - settings=settings, - embedder=embedder, - corpus_dir=corpus_dir, - labels_dir=labels_dir, - ) - rag = evaluate_rag( - session, - settings=settings, - llm=llm, - embedder=embedder, - corpus_dir=corpus_dir, - labels_dir=labels_dir, - ) - +def run_all(session: Session, ctx: EvalContext) -> HarnessReport: + """Run every evaluator against ``ctx`` and assemble the combined report.""" return HarnessReport( - extraction=extraction, - retrieval=retrieval, - rag=rag, - settings_summary=_settings_summary(settings), + extraction=evaluate_extraction(session, ctx), + retrieval=evaluate_retrieval(session, ctx), + rag=evaluate_rag(session, ctx), + settings_summary=_settings_summary(ctx.settings), ) @@ -564,12 +557,11 @@ def referenced_files(labels_dir: Path = LABELS_DIR) -> Iterable[str]: def _collect_filenames(labels: dict[str, Any]) -> Sequence[str]: - out: list[str] = [] - for entry in labels.get("items", []): - if "source_filename" in entry: - out.append(entry["source_filename"]) - for entry in labels.get("queries", []) + labels.get("questions", []): - for rel in entry.get("relevant", []): - if "source_filename" in rel: - out.append(rel["source_filename"]) - return out + items = [e["source_filename"] for e in labels.get("items", []) if "source_filename" in e] + relevant = [ + rel["source_filename"] + for entry in labels.get("queries", []) + labels.get("questions", []) + for rel in entry.get("relevant", []) + if "source_filename" in rel + ] + return items + relevant diff --git a/eval/run.py b/eval/run.py index b3cf073..d7161e2 100644 --- a/eval/run.py +++ b/eval/run.py @@ -11,7 +11,7 @@ from backend.app.config import get_settings from backend.app.db import get_session_factory -from eval.harness import HarnessReport, run_all +from eval.harness import EvalContext, HarnessReport, run_all from eval.results import render REPO_ROOT = Path(__file__).resolve().parent.parent @@ -88,7 +88,7 @@ def main(argv: list[str] | None = None) -> int: settings = get_settings() factory = get_session_factory() with factory() as session: - report = run_all(session, settings=settings) + report = run_all(session, EvalContext.create(settings=settings)) # The harness only reads; ensure no stray writes leak. Rolling back keeps # the eval idempotent against a long-lived DB. session.rollback() From b3385cae968a6f411d971af6c6a5ee76856aa135 Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Mon, 8 Jun 2026 11:41:01 -0400 Subject: [PATCH 2/5] docs(eval): docstring public result dataclasses and run.py entry points Close the genuine public-API docstring gap in the files this PR touches: the four result dataclasses (ExtractionResult/RetrievalResult/RagResult/HarnessReport) and the eval.run functions. Public API in eval/harness.py and eval/run.py is now 100% documented (private helpers stay per CLAUDE.md's 'docstrings on public functions'). --- eval/harness.py | 13 +++++++++++++ eval/run.py | 4 ++++ 2 files changed, 17 insertions(+) diff --git a/eval/harness.py b/eval/harness.py index 5bf523d..2de5095 100644 --- a/eval/harness.py +++ b/eval/harness.py @@ -57,6 +57,9 @@ @dataclass(frozen=True, slots=True) class ExtractionResult: + """Scored extraction metrics, or an n/a result (``quotable=False``) when the + LLM provider is fake or no extraction succeeded.""" + n_documents: int quotable: bool micro_accuracy: float | None = None @@ -69,6 +72,9 @@ class ExtractionResult: @dataclass(frozen=True, slots=True) class RetrievalResult: + """Scored retrieval metrics (precision@k / recall@k / MRR), or an n/a result + (``quotable=False``) under the fake embedder or when no query resolved.""" + n_queries: int k: int quotable: bool @@ -80,6 +86,10 @@ class RetrievalResult: @dataclass(frozen=True, slots=True) class RagResult: + """Scored RAG metrics (citation-validity / cites-relevant / substring-match + rates), or an n/a result (``quotable=False``) under a fake provider or when + every question was refused.""" + n_questions: int refusals: int answered: int @@ -92,6 +102,9 @@ class RagResult: @dataclass(frozen=True, slots=True) class HarnessReport: + """The combined output of all three evaluators plus the run's settings summary, + consumed by :mod:`eval.results` to render ``RESULTS.md``.""" + extraction: ExtractionResult retrieval: RetrievalResult rag: RagResult diff --git a/eval/run.py b/eval/run.py index d7161e2..1e813ad 100644 --- a/eval/run.py +++ b/eval/run.py @@ -19,6 +19,7 @@ def _build_parser() -> argparse.ArgumentParser: + """Build the ``python -m eval.run`` argument parser (``--out`` / ``--no-write``).""" parser = argparse.ArgumentParser( prog="python -m eval.run", description="Run the Sentinel evaluation harness and write eval/RESULTS.md.", @@ -38,6 +39,7 @@ def _build_parser() -> argparse.ArgumentParser: def _print_summary(report: HarnessReport) -> None: + """Print a one-line-per-evaluator summary of the report to stdout.""" s = report.settings_summary print( f"eval: llm={s['llm_provider']}/{s['llm_model']} " @@ -84,6 +86,8 @@ def _print_summary(report: HarnessReport) -> None: def main(argv: list[str] | None = None) -> int: + """Run every evaluator against the configured DB, print a summary, and (unless + ``--no-write``) overwrite ``eval/RESULTS.md``. Returns a process exit code.""" args = _build_parser().parse_args(argv) settings = get_settings() factory = get_session_factory() From 5680bfb8c34f44cccf2d2e52367d5128a5f5f14a Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Mon, 8 Jun 2026 11:44:10 -0400 Subject: [PATCH 3/5] docs(eval): clarify require_llm non-caching and RetrievalResult.n_queries semantics Address CodeRabbit review on PR #18: require_llm's docstring no longer implies caching (it re-resolves each call on a frozen dataclass), and RetrievalResult documents that n_queries is the scored-query count when quotable vs the total label count in the n/a case. Docstring-only; no behaviour change. --- eval/harness.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/eval/harness.py b/eval/harness.py index 2de5095..c3d9e82 100644 --- a/eval/harness.py +++ b/eval/harness.py @@ -73,7 +73,11 @@ class ExtractionResult: @dataclass(frozen=True, slots=True) class RetrievalResult: """Scored retrieval metrics (precision@k / recall@k / MRR), or an n/a result - (``quotable=False``) under the fake embedder or when no query resolved.""" + (``quotable=False``) under the fake embedder or when no query resolved. + + When ``quotable``, ``n_queries`` is the number of queries actually *scored* — + label queries whose relevant chunks don't resolve are skipped and excluded from + the averages. The n/a result reports the total label-set query count instead.""" n_queries: int k: int @@ -141,8 +145,12 @@ def create(cls, settings: Settings | None = None) -> EvalContext: return cls(settings=settings, embedder=get_embedder(settings)) def require_llm(self) -> LLMClient: - """Return the LLM, resolving it from settings on first need. Evaluators that - never call this (e.g. retrieval) never force an LLM provider to resolve.""" + """Return the LLM if set, otherwise resolve one from settings. + + Does not cache the result (this is a frozen dataclass), so callers should + bind the returned client to a local if they use it more than once. + Evaluators that never call this (e.g. retrieval) never force an LLM provider + to resolve.""" return self.llm if self.llm is not None else get_llm(self.settings) From ebcbe169a81f273b3f677a595838df24f04512e8 Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Mon, 8 Jun 2026 11:53:55 -0400 Subject: [PATCH 4/5] docs(eval): 100% docstring coverage on the files this PR touches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit's docstring-coverage pre-merge check is scored on the PR's changed files (PR #18 was at 64.5% < 80% threshold). Document every remaining function/method/class in eval/harness.py and test_eval_harness.py with meaningful one-liners (consistent with the tests that already carried docstrings) — describing what each test verifies and what each helper does, not stub padding. Changed files now 100%; clears the advisory honestly without lowering the threshold. Code Health unchanged (10.0); make-check green. --- backend/tests/test_eval_harness.py | 16 ++++++++++++++++ eval/harness.py | 6 ++++++ 2 files changed, 22 insertions(+) diff --git a/backend/tests/test_eval_harness.py b/backend/tests/test_eval_harness.py index f714494..4815c8a 100644 --- a/backend/tests/test_eval_harness.py +++ b/backend/tests/test_eval_harness.py @@ -65,6 +65,7 @@ class ScriptedFakeLLM: def complete( self, *, system: str, user: str, max_tokens: int, temperature: float ) -> LLMResponse: + """Return the chunk-keyed scripted response if set, else the default text.""" self.calls += 1 match = re.search(r"\[chunk:(\d+)\]", user) text: str @@ -80,14 +81,17 @@ class _RankedEmbedder: input text. Pins cosine similarity rankings for the retrieval/RAG tests.""" def __init__(self, mapping: dict[str, list[float]], default: list[float] | None = None) -> None: + """Store the text→vector mapping and the fallback vector for unmatched text.""" self._mapping = mapping self._default = default or _basis(0) @property def dim(self) -> int: + """Embedding dimension (matches the database schema).""" return SCHEMA_EMBEDDING_DIM def embed(self, texts: Sequence[str]) -> list[list[float]]: + """Return the mapped vector for each text (first substring match wins).""" out: list[list[float]] = [] for text in texts: matched = self._default @@ -100,6 +104,7 @@ def embed(self, texts: Sequence[str]) -> list[list[float]]: def _basis(i: int, dim: int = SCHEMA_EMBEDDING_DIM) -> list[float]: + """Unit basis vector with ``1.0`` at index ``i``.""" v = [0.0] * dim v[i] = 1.0 return v @@ -118,6 +123,7 @@ def _unit(strength: float, dim: int = SCHEMA_EMBEDDING_DIM) -> list[float]: def _write_corpus(tmp_path: Path, files: dict[str, str]) -> Path: + """Write a corpus directory from ``{filename: body}`` and return its path.""" corpus = tmp_path / "corpus" corpus.mkdir() for name, body in files.items(): @@ -126,6 +132,7 @@ def _write_corpus(tmp_path: Path, files: dict[str, str]) -> Path: def _write_labels(tmp_path: Path, name: str, content: dict[str, Any]) -> Path: + """Write a label JSON file under a labels dir and return the dir.""" labels = tmp_path / "labels" labels.mkdir(exist_ok=True) (labels / name).write_text(json.dumps(content), encoding="utf-8") @@ -153,6 +160,7 @@ def _real_provider_settings(**overrides: Any) -> Settings: def test_evaluate_extraction_emits_na_under_fake_llm(session: Session, tmp_path: Path) -> None: + """A fake LLM provider makes extraction non-quotable (n/a, None metrics).""" corpus = _write_corpus(tmp_path, {"a.md": "Anything"}) labels = _write_labels( tmp_path, @@ -188,6 +196,7 @@ def test_evaluate_extraction_emits_na_under_fake_llm(session: Session, tmp_path: def test_evaluate_retrieval_emits_na_under_fake_embedder(session: Session, tmp_path: Path) -> None: + """A fake embedder makes retrieval non-quotable (n/a, None metrics).""" corpus = _write_corpus(tmp_path, {"a.md": "Anything"}) labels = _write_labels( tmp_path, @@ -217,6 +226,7 @@ def test_evaluate_retrieval_emits_na_under_fake_embedder(session: Session, tmp_p def test_evaluate_rag_emits_na_under_either_fake(session: Session, tmp_path: Path) -> None: + """Either fake provider makes RAG non-quotable (n/a, None metrics).""" corpus = _write_corpus(tmp_path, {"a.md": "Anything"}) labels = _write_labels( tmp_path, @@ -252,6 +262,7 @@ def test_evaluate_rag_emits_na_under_either_fake(session: Session, tmp_path: Pat def _perfect_invoice_json_for(chunk_id: int) -> str: + """Build a perfect-extraction invoice JSON whose fields cite ``chunk_id``.""" return json.dumps( { "invoice_number": { @@ -275,12 +286,14 @@ def _perfect_invoice_json_for(chunk_id: int) -> str: def _wrong_vendor_invoice_json_for(chunk_id: int) -> str: + """Perfect invoice JSON but with a wrong vendor value (one bad field).""" payload = json.loads(_perfect_invoice_json_for(chunk_id)) payload["vendor"]["value"] = "WRONG" return json.dumps(payload) def _expected_invoice_payload() -> dict[str, Any]: + """The ground-truth invoice payload the extraction fixtures score against.""" return { "invoice_number": "INV-X", "vendor": "Acme", @@ -292,6 +305,7 @@ def _expected_invoice_payload() -> dict[str, Any]: def test_evaluate_extraction_perfect_run_yields_unit_accuracy( session: Session, tmp_path: Path ) -> None: + """A perfect extraction run yields 1.0 on micro/macro and every per-field axis.""" corpus = _write_corpus(tmp_path, {"x.md": "INV-X from Acme, 2026-01-22, total $1,234.56."}) labels = _write_labels( tmp_path, @@ -437,6 +451,7 @@ def test_evaluate_rag_pins_three_rates_on_happy_path(session: Session, tmp_path: embedder = _RankedEmbedder({"Initech": _unit(1.0)}, default=_unit(1.0)) def respond(cid: int) -> str: + """Answer citing the runtime-allocated chunk id with the expected substring.""" return f"The total due is 90,006.92 dollars [chunk:{cid}]." llm = ScriptedFakeLLM(response_for_chunk=respond) @@ -475,6 +490,7 @@ def respond(cid: int) -> str: def test_render_pending_contains_methodology_only_no_numbers() -> None: + """The PENDING RESULTS.md renders methodology only — no numeric metrics.""" body = render_pending() assert "Numbers pending real-provider run" in body assert "claude-sonnet-4-6" in body diff --git a/eval/harness.py b/eval/harness.py index c3d9e82..4b4a97c 100644 --- a/eval/harness.py +++ b/eval/harness.py @@ -158,10 +158,12 @@ def require_llm(self) -> LLMClient: def _read_corpus_file(corpus_dir: Path, source_filename: str) -> str: + """Read a corpus file's text by name, relative to ``corpus_dir``.""" return (corpus_dir / source_filename).read_text(encoding="utf-8") def _load_labels(name: str, labels_dir: Path) -> dict[str, Any]: + """Load a label set from JSON, requiring a top-level object.""" data = json.loads((labels_dir / name).read_text(encoding="utf-8")) if not isinstance(data, dict): raise ValueError(f"label file {name!r} must contain a JSON object at top level") @@ -202,6 +204,7 @@ def _ingest_relevant_sources( def _resolve_chunk_id( session: Session, ctx: EvalContext, source_filename: str, chunk_ord: int ) -> int | None: + """Resolve the chunk id for a ``(source file, chunk ord)`` pair, or ``None``.""" text = _read_corpus_file(ctx.corpus_dir, source_filename) doc = documents_repo.get_by_hash(session, canonical_hash(text)) if doc is None: @@ -417,6 +420,7 @@ def evaluate_retrieval(session: Session, ctx: EvalContext) -> RetrievalResult: def _parse_cited_chunk_ids(text: str) -> set[int]: + """Return the chunk ids cited as ``[chunk:N]`` markers in ``text``.""" return {int(m.group(1)) for m in CITATION_PATTERN.finditer(text)} @@ -540,6 +544,7 @@ def evaluate_rag(session: Session, ctx: EvalContext) -> RagResult: def _settings_summary(settings: Settings) -> dict[str, Any]: + """Snapshot the run's provider / model / threshold settings for RESULTS.md.""" # Report the *active* provider's model, not a hardcoded Anthropic/OpenAI label, so # a Gemini (or fake) run does not mislabel itself in RESULTS.md (Golden Rule #5). return { @@ -578,6 +583,7 @@ def referenced_files(labels_dir: Path = LABELS_DIR) -> Iterable[str]: def _collect_filenames(labels: dict[str, Any]) -> Sequence[str]: + """Return every ``source_filename`` referenced in a single label set.""" items = [e["source_filename"] for e in labels.get("items", []) if "source_filename" in e] relevant = [ rel["source_filename"] From 7edd0e28069ec9eca523a31b97013fc948962e19 Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Mon, 8 Jun 2026 12:04:40 -0400 Subject: [PATCH 5/5] chore: add versioned CodeRabbit config documenting docstring standards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Config-as-code in preference to hidden Org-UI settings. Encodes CLAUDE.md's docstring convention (public functions documented; comments explain why; tests self-document) as per-path docstring-generation instructions. Deliberately does NOT lower or disable the docstring-coverage pre-merge check — its threshold stays at the Org default. Side effect: the fresh review this commit triggers recomputes the (previously lagging) docstring-coverage figure at the current head, where the changed files are 100% documented. --- .coderabbit.yaml | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 .coderabbit.yaml diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 0000000..4577dd3 --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,36 @@ +# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json +# +# Versioned CodeRabbit configuration (config-as-code, in preference to hidden +# Org-UI settings). Encodes this repo's documented docstring convention from +# CLAUDE.md: docstrings on PUBLIC functions; comments explain *why*, not *what*; +# test names are self-documenting (a concise one-liner of intent is enough). +# +# Note on the "Docstring Coverage" pre-merge check: its threshold is intentionally +# left at the Org-UI default — this file does not lower or disable it. It only +# guides the docstrings CodeRabbit *generates* so new code matches the house style. + +reviews: + profile: assertive + auto_review: + enabled: true + drafts: false + +code_generation: + docstrings: + path_instructions: + - path: "backend/app/**/*.py" + instructions: | + Google-style docstrings on public functions and classes, with Args / + Returns / Raises where they add information. Document behaviour and edge + cases; keep comments focused on *why*, not *what*. Private helpers may be + left undocumented when their name and signature are self-evident. + - path: "eval/**/*.py" + instructions: | + Concise docstrings on public functions and classes. For evaluators and + result types, state the honesty-gate behaviour (quotable vs n/a) so the + contract is explicit. Comments explain *why*, not *what*. + - path: "**/test_*.py" + instructions: | + A one-line docstring stating what each test verifies (e.g. the asserted + metric value or invariant). Keep it concise; the test name carries the + rest. Shared fixtures/helpers get a short purpose line.