Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .coderabbit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
#
# Versioned CodeRabbit configuration (config-as-code, in preference to hidden
# Org-UI settings). Encodes this repo's documented docstring convention from
# CLAUDE.md: docstrings on PUBLIC functions; comments explain *why*, not *what*;
# test names are self-documenting (a concise one-liner of intent is enough).
#
# Note on the "Docstring Coverage" pre-merge check: its threshold is intentionally
# left at the Org-UI default — this file does not lower or disable it. It only
# guides the docstrings CodeRabbit *generates* so new code matches the house style.

reviews:
  profile: assertive
  auto_review:
    enabled: true
    drafts: false

code_generation:
  docstrings:
    path_instructions:
      - path: "backend/app/**/*.py"
        instructions: |
          Google-style docstrings on public functions and classes, with Args /
          Returns / Raises where they add information. Document behaviour and edge
          cases; keep comments focused on *why*, not *what*. Private helpers may be
          left undocumented when their name and signature are self-evident.
      - path: "eval/**/*.py"
        instructions: |
          Concise docstrings on public functions and classes. For evaluators and
          result types, state the honesty-gate behaviour (quotable vs n/a) so the
          contract is explicit. Comments explain *why*, not *what*.
      - path: "**/test_*.py"
        instructions: |
          A one-line docstring stating what each test verifies (e.g. the asserted
          metric value or invariant). Keep it concise; the test name carries the
          rest. Shared fixtures/helpers get a short purpose line.
49 changes: 33 additions & 16 deletions backend/tests/test_eval_harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from backend.app.llm import LLMResponse
from backend.app.models import SCHEMA_EMBEDDING_DIM
from eval.harness import (
EvalContext,
HarnessReport,
RagResult,
RetrievalResult,
Expand All @@ -64,6 +65,7 @@ class ScriptedFakeLLM:
def complete(
self, *, system: str, user: str, max_tokens: int, temperature: float
) -> LLMResponse:
"""Return the chunk-keyed scripted response if set, else the default text."""
self.calls += 1
match = re.search(r"\[chunk:(\d+)\]", user)
text: str
Expand All @@ -79,14 +81,17 @@ class _RankedEmbedder:
input text. Pins cosine similarity rankings for the retrieval/RAG tests."""

def __init__(self, mapping: dict[str, list[float]], default: list[float] | None = None) -> None:
    """Remember the text→vector table and the vector used for unmatched text.

    Args:
        mapping: Vectors keyed by text; kept by reference, not copied.
        default: Fallback vector. When omitted (or otherwise falsy, e.g. an
            empty list), ``_basis(0)`` is used instead.
    """
    self._mapping = mapping
    # Truthiness check on purpose: an empty vector is treated like "not given".
    if default:
        self._default = default
    else:
        self._default = _basis(0)

@property
def dim(self) -> int:
    """Report the embedding width, taken directly from ``SCHEMA_EMBEDDING_DIM``."""
    width: int = SCHEMA_EMBEDDING_DIM
    return width

def embed(self, texts: Sequence[str]) -> list[list[float]]:
"""Return the mapped vector for each text (first substring match wins)."""
out: list[list[float]] = []
for text in texts:
matched = self._default
Expand All @@ -99,6 +104,7 @@ def embed(self, texts: Sequence[str]) -> list[list[float]]:


def _basis(i: int, dim: int = SCHEMA_EMBEDDING_DIM) -> list[float]:
    """Build a length-``dim`` vector that is all ``0.0`` except for ``1.0`` at ``i``.

    ``i`` is applied with ordinary list indexing, so a negative value counts
    from the end of the vector.

    Raises:
        IndexError: If ``i`` falls outside a list of length ``dim``.
    """
    one_hot = [0.0 for _ in range(dim)]
    one_hot[i] = 1.0
    return one_hot
Expand All @@ -117,6 +123,7 @@ def _unit(strength: float, dim: int = SCHEMA_EMBEDDING_DIM) -> list[float]:


def _write_corpus(tmp_path: Path, files: dict[str, str]) -> Path:
"""Write a corpus directory from ``{filename: body}`` and return its path."""
corpus = tmp_path / "corpus"
corpus.mkdir()
for name, body in files.items():
Expand All @@ -125,6 +132,7 @@ def _write_corpus(tmp_path: Path, files: dict[str, str]) -> Path:


def _write_labels(tmp_path: Path, name: str, content: dict[str, Any]) -> Path:
"""Write a label JSON file under a labels dir and return the dir."""
labels = tmp_path / "labels"
labels.mkdir(exist_ok=True)
(labels / name).write_text(json.dumps(content), encoding="utf-8")
Expand Down Expand Up @@ -152,6 +160,7 @@ def _real_provider_settings(**overrides: Any) -> Settings:


def test_evaluate_extraction_emits_na_under_fake_llm(session: Session, tmp_path: Path) -> None:
"""A fake LLM provider makes extraction non-quotable (n/a, None metrics)."""
corpus = _write_corpus(tmp_path, {"a.md": "Anything"})
labels = _write_labels(
tmp_path,
Expand All @@ -171,14 +180,14 @@ def test_evaluate_extraction_emits_na_under_fake_llm(session: Session, tmp_path:
]
},
)
result = evaluate_extraction(
session,
ctx = EvalContext(
settings=Settings.model_validate({"llm_provider": "fake", "embeddings_provider": "fake"}),
llm=ScriptedFakeLLM(),
embedder=FakeEmbedder(),
corpus_dir=corpus,
labels_dir=labels,
)
result = evaluate_extraction(session, ctx)
assert result.quotable is False
assert result.micro_accuracy is None
assert result.macro_accuracy is None
Expand All @@ -187,6 +196,7 @@ def test_evaluate_extraction_emits_na_under_fake_llm(session: Session, tmp_path:


def test_evaluate_retrieval_emits_na_under_fake_embedder(session: Session, tmp_path: Path) -> None:
"""A fake embedder makes retrieval non-quotable (n/a, None metrics)."""
corpus = _write_corpus(tmp_path, {"a.md": "Anything"})
labels = _write_labels(
tmp_path,
Expand All @@ -201,13 +211,13 @@ def test_evaluate_retrieval_emits_na_under_fake_embedder(session: Session, tmp_p
],
},
)
result = evaluate_retrieval(
session,
ctx = EvalContext(
settings=Settings.model_validate({"embeddings_provider": "fake"}),
embedder=FakeEmbedder(),
corpus_dir=corpus,
labels_dir=labels,
)
result = evaluate_retrieval(session, ctx)
assert result.quotable is False
assert result.precision_at_k is None
assert result.recall_at_k is None
Expand All @@ -216,6 +226,7 @@ def test_evaluate_retrieval_emits_na_under_fake_embedder(session: Session, tmp_p


def test_evaluate_rag_emits_na_under_either_fake(session: Session, tmp_path: Path) -> None:
"""Either fake provider makes RAG non-quotable (n/a, None metrics)."""
corpus = _write_corpus(tmp_path, {"a.md": "Anything"})
labels = _write_labels(
tmp_path,
Expand All @@ -234,14 +245,14 @@ def test_evaluate_rag_emits_na_under_either_fake(session: Session, tmp_path: Pat
Settings.model_validate({"llm_provider": "fake", "embeddings_provider": "openai"}),
Settings.model_validate({"llm_provider": "anthropic", "embeddings_provider": "fake"}),
):
result = evaluate_rag(
session,
ctx = EvalContext(
settings=settings,
llm=ScriptedFakeLLM(),
embedder=FakeEmbedder(),
corpus_dir=corpus,
labels_dir=labels,
)
result = evaluate_rag(session, ctx)
assert result.quotable is False
assert result.citation_validity_rate is None
assert result.cites_relevant_rate is None
Expand All @@ -251,6 +262,7 @@ def test_evaluate_rag_emits_na_under_either_fake(session: Session, tmp_path: Pat


def _perfect_invoice_json_for(chunk_id: int) -> str:
"""Build a perfect-extraction invoice JSON whose fields cite ``chunk_id``."""
return json.dumps(
{
"invoice_number": {
Expand All @@ -274,12 +286,14 @@ def _perfect_invoice_json_for(chunk_id: int) -> str:


def _wrong_vendor_invoice_json_for(chunk_id: int) -> str:
    """Return the perfect invoice JSON for ``chunk_id`` with its vendor value set to ``"WRONG"``.

    Only ``vendor.value`` is overwritten; every other key produced by
    ``_perfect_invoice_json_for`` is decoded and re-encoded untouched.
    """
    invoice = json.loads(_perfect_invoice_json_for(chunk_id))
    vendor_field = invoice["vendor"]
    vendor_field["value"] = "WRONG"
    return json.dumps(invoice)


def _expected_invoice_payload() -> dict[str, Any]:
"""The ground-truth invoice payload the extraction fixtures score against."""
return {
"invoice_number": "INV-X",
"vendor": "Acme",
Expand All @@ -291,6 +305,7 @@ def _expected_invoice_payload() -> dict[str, Any]:
def test_evaluate_extraction_perfect_run_yields_unit_accuracy(
session: Session, tmp_path: Path
) -> None:
"""A perfect extraction run yields 1.0 on micro/macro and every per-field axis."""
corpus = _write_corpus(tmp_path, {"x.md": "INV-X from Acme, 2026-01-22, total $1,234.56."})
labels = _write_labels(
tmp_path,
Expand All @@ -307,14 +322,14 @@ def test_evaluate_extraction_perfect_run_yields_unit_accuracy(
)
llm = ScriptedFakeLLM(response_for_chunk=_perfect_invoice_json_for)

result = evaluate_extraction(
session,
ctx = EvalContext(
settings=_real_provider_settings(),
llm=llm,
embedder=FakeEmbedder(),
corpus_dir=corpus,
labels_dir=labels,
)
result = evaluate_extraction(session, ctx)
assert result.quotable is True
assert result.failed_extractions == 0
assert result.micro_accuracy == pytest.approx(1.0)
Expand Down Expand Up @@ -345,14 +360,14 @@ def test_evaluate_extraction_one_wrong_field_pins_micro_macro(
)
llm = ScriptedFakeLLM(response_for_chunk=_wrong_vendor_invoice_json_for)

result = evaluate_extraction(
session,
ctx = EvalContext(
settings=_real_provider_settings(),
llm=llm,
embedder=FakeEmbedder(),
corpus_dir=corpus,
labels_dir=labels,
)
result = evaluate_extraction(session, ctx)
assert result.quotable is True
assert result.failed_extractions == 0
# 3 of 4 fields correct → micro = 0.75. Macro = mean of (1, 0, 1, 1) = 0.75.
Expand Down Expand Up @@ -410,13 +425,13 @@ def test_evaluate_retrieval_pins_precision_recall_mrr(session: Session, tmp_path
},
)

result = evaluate_retrieval(
session,
ctx = EvalContext(
settings=_real_provider_settings(retrieval_top_k=2),
embedder=embedder,
corpus_dir=corpus,
labels_dir=labels,
)
result = evaluate_retrieval(session, ctx)
assert result.quotable is True
assert result.k == 2
assert result.precision_at_k == pytest.approx(0.5)
Expand All @@ -436,6 +451,7 @@ def test_evaluate_rag_pins_three_rates_on_happy_path(session: Session, tmp_path:
embedder = _RankedEmbedder({"Initech": _unit(1.0)}, default=_unit(1.0))

def respond(cid: int) -> str:
"""Answer citing the runtime-allocated chunk id with the expected substring."""
return f"The total due is 90,006.92 dollars [chunk:{cid}]."

llm = ScriptedFakeLLM(response_for_chunk=respond)
Expand All @@ -454,14 +470,14 @@ def respond(cid: int) -> str:
},
)

result = evaluate_rag(
session,
ctx = EvalContext(
settings=_real_provider_settings(retrieval_top_k=3),
llm=llm,
embedder=embedder,
corpus_dir=corpus,
labels_dir=labels,
)
result = evaluate_rag(session, ctx)
assert result.quotable is True
assert result.refusals == 0
assert result.answered == 1
Expand All @@ -474,6 +490,7 @@ def respond(cid: int) -> str:


def test_render_pending_contains_methodology_only_no_numbers() -> None:
"""The PENDING RESULTS.md renders methodology only — no numeric metrics."""
body = render_pending()
assert "Numbers pending real-provider run" in body
assert "claude-sonnet-4-6" in body
Expand Down Expand Up @@ -502,14 +519,14 @@ def test_render_writes_real_metrics_when_quotable(session: Session, tmp_path: Pa
)
llm = ScriptedFakeLLM(response_for_chunk=_perfect_invoice_json_for)

extraction = evaluate_extraction(
session,
ctx = EvalContext(
settings=_real_provider_settings(),
llm=llm,
embedder=FakeEmbedder(),
corpus_dir=corpus,
labels_dir=labels,
)
extraction = evaluate_extraction(session, ctx)

report = HarnessReport(
extraction=extraction,
Expand Down
Loading
Loading