div0rce · div0rce · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/.coderabbit.yaml b/.coderabbit.yaml
@@ -0,0 +1,36 @@
+# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
+#
+# Versioned CodeRabbit configuration (config-as-code, in preference to hidden
+# Org-UI settings). Encodes this repo's documented docstring convention from
+# CLAUDE.md: docstrings on PUBLIC functions; comments explain *why*, not *what*;
+# test names are self-documenting (a concise one-liner of intent is enough).
+#
+# Note on the "Docstring Coverage" pre-merge check: its threshold is intentionally
+# left at the Org-UI default — this file does not lower or disable it. It only
+# guides the docstrings CodeRabbit *generates* so new code matches the house style.
+
+reviews:
+  profile: assertive
+  auto_review:
+    enabled: true
+    drafts: false
+
+code_generation:
+  docstrings:
+    path_instructions:
+      - path: "backend/app/**/*.py"
+        instructions: |
+          Google-style docstrings on public functions and classes, with Args /
+          Returns / Raises where they add information. Document behaviour and edge
+          cases; keep comments focused on *why*, not *what*. Private helpers may be
+          left undocumented when their name and signature are self-evident.
+      - path: "eval/**/*.py"
+        instructions: |
+          Concise docstrings on public functions and classes. For evaluators and
+          result types, state the honesty-gate behaviour (quotable vs n/a) so the
+          contract is explicit. Comments explain *why*, not *what*.
+      - path: "**/test_*.py"
+        instructions: |
+          A one-line docstring stating what each test verifies (e.g. the asserted
+          metric value or invariant). Keep it concise; the test name carries the
+          rest. Shared fixtures/helpers get a short purpose line.
diff --git a/backend/tests/test_eval_harness.py b/backend/tests/test_eval_harness.py
@@ -38,6 +38,7 @@
 from backend.app.llm import LLMResponse
 from backend.app.models import SCHEMA_EMBEDDING_DIM
 from eval.harness import (
+    EvalContext,
     HarnessReport,
     RagResult,
     RetrievalResult,
@@ -64,6 +65,7 @@ class ScriptedFakeLLM:
     def complete(
         self, *, system: str, user: str, max_tokens: int, temperature: float
     ) -> LLMResponse:
+        """Return the chunk-keyed scripted response if set, else the default text."""
         self.calls += 1
         match = re.search(r"\[chunk:(\d+)\]", user)
         text: str
@@ -79,14 +81,17 @@ class _RankedEmbedder:
     input text. Pins cosine similarity rankings for the retrieval/RAG tests."""
 
     def __init__(self, mapping: dict[str, list[float]], default: list[float] | None = None) -> None:
+        """Store the text→vector mapping and the fallback vector for unmatched text."""
         self._mapping = mapping
         self._default = default or _basis(0)
 
     @property
     def dim(self) -> int:
+        """Embedding dimension (matches the database schema)."""
         return SCHEMA_EMBEDDING_DIM
 
     def embed(self, texts: Sequence[str]) -> list[list[float]]:
+        """Return the mapped vector for each text (first substring match wins)."""
         out: list[list[float]] = []
         for text in texts:
             matched = self._default
@@ -99,6 +104,7 @@ def embed(self, texts: Sequence[str]) -> list[list[float]]:
 
 
 def _basis(i: int, dim: int = SCHEMA_EMBEDDING_DIM) -> list[float]:
+    """Return the unit basis vector with ``1.0`` at index ``i``."""
     v = [0.0] * dim
     v[i] = 1.0
     return v
@@ -117,6 +123,7 @@ def _unit(strength: float, dim: int = SCHEMA_EMBEDDING_DIM) -> list[float]:
 
 
 def _write_corpus(tmp_path: Path, files: dict[str, str]) -> Path:
+    """Write a corpus directory from ``{filename: body}`` and return its path."""
     corpus = tmp_path / "corpus"
     corpus.mkdir()
     for name, body in files.items():
@@ -125,6 +132,7 @@ def _write_corpus(tmp_path: Path, files: dict[str, str]) -> Path:
 
 
 def _write_labels(tmp_path: Path, name: str, content: dict[str, Any]) -> Path:
+    """Write a label JSON file under a labels dir and return the dir."""
     labels = tmp_path / "labels"
     labels.mkdir(exist_ok=True)
     (labels / name).write_text(json.dumps(content), encoding="utf-8")
@@ -152,6 +160,7 @@ def _real_provider_settings(**overrides: Any) -> Settings:
 
 
 def test_evaluate_extraction_emits_na_under_fake_llm(session: Session, tmp_path: Path) -> None:
+    """A fake LLM provider makes extraction non-quotable (n/a, None metrics)."""
     corpus = _write_corpus(tmp_path, {"a.md": "Anything"})
     labels = _write_labels(
         tmp_path,
@@ -171,14 +180,14 @@ def test_evaluate_extraction_emits_na_under_fake_llm(session: Session, tmp_path:
             ]
         },
     )
-    result = evaluate_extraction(
-        session,
+    ctx = EvalContext(
         settings=Settings.model_validate({"llm_provider": "fake", "embeddings_provider": "fake"}),
         llm=ScriptedFakeLLM(),
         embedder=FakeEmbedder(),
         corpus_dir=corpus,
         labels_dir=labels,
     )
+    result = evaluate_extraction(session, ctx)
     assert result.quotable is False
     assert result.micro_accuracy is None
     assert result.macro_accuracy is None
@@ -187,6 +196,7 @@ def test_evaluate_extraction_emits_na_under_fake_llm(session: Session, tmp_path:
 
 
 def test_evaluate_retrieval_emits_na_under_fake_embedder(session: Session, tmp_path: Path) -> None:
+    """A fake embedder makes retrieval non-quotable (n/a, None metrics)."""
     corpus = _write_corpus(tmp_path, {"a.md": "Anything"})
     labels = _write_labels(
         tmp_path,
@@ -201,13 +211,13 @@ def test_evaluate_retrieval_emits_na_under_fake_embedder(session: Session, tmp_p
             ],
         },
     )
-    result = evaluate_retrieval(
-        session,
+    ctx = EvalContext(
         settings=Settings.model_validate({"embeddings_provider": "fake"}),
         embedder=FakeEmbedder(),
         corpus_dir=corpus,
         labels_dir=labels,
     )
+    result = evaluate_retrieval(session, ctx)
     assert result.quotable is False
     assert result.precision_at_k is None
     assert result.recall_at_k is None
@@ -216,6 +226,7 @@ def test_evaluate_retrieval_emits_na_under_fake_embedder(session: Session, tmp_p
 
 
 def test_evaluate_rag_emits_na_under_either_fake(session: Session, tmp_path: Path) -> None:
+    """Either fake provider makes RAG non-quotable (n/a, None metrics)."""
     corpus = _write_corpus(tmp_path, {"a.md": "Anything"})
     labels = _write_labels(
         tmp_path,
@@ -234,14 +245,14 @@ def test_evaluate_rag_emits_na_under_either_fake(session: Session, tmp_path: Pat
         Settings.model_validate({"llm_provider": "fake", "embeddings_provider": "openai"}),
         Settings.model_validate({"llm_provider": "anthropic", "embeddings_provider": "fake"}),
     ):
-        result = evaluate_rag(
-            session,
+        ctx = EvalContext(
             settings=settings,
             llm=ScriptedFakeLLM(),
             embedder=FakeEmbedder(),
             corpus_dir=corpus,
             labels_dir=labels,
         )
+        result = evaluate_rag(session, ctx)
         assert result.quotable is False
         assert result.citation_validity_rate is None
         assert result.cites_relevant_rate is None
@@ -251,6 +262,7 @@ def test_evaluate_rag_emits_na_under_either_fake(session: Session, tmp_path: Pat
 
 
 def _perfect_invoice_json_for(chunk_id: int) -> str:
+    """Build a perfect-extraction invoice JSON whose fields cite ``chunk_id``."""
     return json.dumps(
         {
             "invoice_number": {
@@ -274,12 +286,14 @@ def _perfect_invoice_json_for(chunk_id: int) -> str:
 
 
 def _wrong_vendor_invoice_json_for(chunk_id: int) -> str:
+    """Return the perfect invoice JSON with the vendor value set to "WRONG" (one bad field)."""
     payload = json.loads(_perfect_invoice_json_for(chunk_id))
     payload["vendor"]["value"] = "WRONG"
     return json.dumps(payload)
 
 
 def _expected_invoice_payload() -> dict[str, Any]:
+    """Return the ground-truth invoice payload the extraction fixtures score against."""
     return {
         "invoice_number": "INV-X",
         "vendor": "Acme",
@@ -291,6 +305,7 @@ def _expected_invoice_payload() -> dict[str, Any]:
 def test_evaluate_extraction_perfect_run_yields_unit_accuracy(
     session: Session, tmp_path: Path
 ) -> None:
+    """A perfect extraction run yields 1.0 on micro/macro and every per-field axis."""
     corpus = _write_corpus(tmp_path, {"x.md": "INV-X from Acme, 2026-01-22, total $1,234.56."})
     labels = _write_labels(
         tmp_path,
@@ -307,14 +322,14 @@ def test_evaluate_extraction_perfect_run_yields_unit_accuracy(
     )
     llm = ScriptedFakeLLM(response_for_chunk=_perfect_invoice_json_for)
 
-    result = evaluate_extraction(
-        session,
+    ctx = EvalContext(
         settings=_real_provider_settings(),
         llm=llm,
         embedder=FakeEmbedder(),
         corpus_dir=corpus,
         labels_dir=labels,
     )
+    result = evaluate_extraction(session, ctx)
     assert result.quotable is True
     assert result.failed_extractions == 0
     assert result.micro_accuracy == pytest.approx(1.0)
@@ -345,14 +360,14 @@ def test_evaluate_extraction_one_wrong_field_pins_micro_macro(
     )
     llm = ScriptedFakeLLM(response_for_chunk=_wrong_vendor_invoice_json_for)
 
-    result = evaluate_extraction(
-        session,
+    ctx = EvalContext(
         settings=_real_provider_settings(),
         llm=llm,
         embedder=FakeEmbedder(),
         corpus_dir=corpus,
         labels_dir=labels,
     )
+    result = evaluate_extraction(session, ctx)
     assert result.quotable is True
     assert result.failed_extractions == 0
     # 3 of 4 fields correct → micro = 0.75. Macro = mean of (1, 0, 1, 1) = 0.75.
@@ -410,13 +425,13 @@ def test_evaluate_retrieval_pins_precision_recall_mrr(session: Session, tmp_path
         },
     )
 
-    result = evaluate_retrieval(
-        session,
+    ctx = EvalContext(
         settings=_real_provider_settings(retrieval_top_k=2),
         embedder=embedder,
         corpus_dir=corpus,
         labels_dir=labels,
     )
+    result = evaluate_retrieval(session, ctx)
     assert result.quotable is True
     assert result.k == 2
     assert result.precision_at_k == pytest.approx(0.5)
@@ -436,6 +451,7 @@ def test_evaluate_rag_pins_three_rates_on_happy_path(session: Session, tmp_path:
     embedder = _RankedEmbedder({"Initech": _unit(1.0)}, default=_unit(1.0))
 
     def respond(cid: int) -> str:
+        """Answer citing the runtime-allocated chunk id with the expected substring."""
         return f"The total due is 90,006.92 dollars [chunk:{cid}]."
 
     llm = ScriptedFakeLLM(response_for_chunk=respond)
@@ -454,14 +470,14 @@ def respond(cid: int) -> str:
         },
     )
 
-    result = evaluate_rag(
-        session,
+    ctx = EvalContext(
         settings=_real_provider_settings(retrieval_top_k=3),
         llm=llm,
         embedder=embedder,
         corpus_dir=corpus,
         labels_dir=labels,
     )
+    result = evaluate_rag(session, ctx)
     assert result.quotable is True
     assert result.refusals == 0
     assert result.answered == 1
@@ -474,6 +490,7 @@ def respond(cid: int) -> str:
 
 
 def test_render_pending_contains_methodology_only_no_numbers() -> None:
+    """The PENDING RESULTS.md renders methodology only — no numeric metrics."""
     body = render_pending()
     assert "Numbers pending real-provider run" in body
     assert "claude-sonnet-4-6" in body
@@ -502,14 +519,14 @@ def test_render_writes_real_metrics_when_quotable(session: Session, tmp_path: Pa
     )
     llm = ScriptedFakeLLM(response_for_chunk=_perfect_invoice_json_for)
 
-    extraction = evaluate_extraction(
-        session,
+    ctx = EvalContext(
         settings=_real_provider_settings(),
         llm=llm,
         embedder=FakeEmbedder(),
         corpus_dir=corpus,
         labels_dir=labels,
     )
+    extraction = evaluate_extraction(session, ctx)
 
     report = HarnessReport(
         extraction=extraction,