From 6d7678a31d8302753a00aaec076cf7fc2150c66c Mon Sep 17 00:00:00 2001
From: nasr <156965421+div0rce@users.noreply.github.com>
Date: Mon, 8 Jun 2026 15:41:48 -0400
Subject: [PATCH] docs: backfill docstrings on public backend/app API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Document every remaining public production definition that CodeRabbit's docstring-coverage
check flags: embedding/LLM provider impls (dim/embed/model_name/complete), the ingest CLI
entry, the request-id middleware dispatch, the three repository get() accessors, and the
dashboard/review Pydantic response models. Aligns with CLAUDE.md ('docstrings on public
functions'); private helpers stay undocumented by that same convention. With PR #18 (which
documents eval/), the repo's entire public production API is documented. Docstring-only —
no behaviour change; `make check` is green (222 backend pytest tests, ruff/mypy).
---
 backend/app/embeddings/fake.py             |  2 ++
 backend/app/embeddings/gemini_provider.py  |  1 +
 backend/app/embeddings/openai_provider.py  |  3 +++
 backend/app/ingest.py                      |  2 ++
 backend/app/llm/claude.py                  |  2 ++
 backend/app/llm/fake.py                    |  2 ++
 backend/app/llm/gemini.py                  |  1 +
 backend/app/observability.py               |  2 ++
 backend/app/repositories/chunks.py         |  1 +
 backend/app/repositories/extractions.py    |  1 +
 backend/app/repositories/workflow_items.py |  1 +
 backend/app/routers/dashboard.py           | 12 ++++++++++++
 backend/app/routers/review.py              |  4 ++++
 13 files changed, 34 insertions(+)

diff --git a/backend/app/embeddings/fake.py b/backend/app/embeddings/fake.py
index 648d2d2..d578378 100644
--- a/backend/app/embeddings/fake.py
+++ b/backend/app/embeddings/fake.py
@@ -30,9 +30,11 @@ def __init__(self, dim: int = SCHEMA_EMBEDDING_DIM) -> None:
 
     @property
     def dim(self) -> int:
+        """Output vector dimension."""
         return self._dim
 
     def embed(self, texts: Sequence[str]) -> list[list[float]]:
+        """Return one deterministic, hash-derived vector per input text, in order."""
         return [self._embed_one(t) for t in texts]
 
     def _embed_one(self, text: str) -> list[float]:
diff --git a/backend/app/embeddings/gemini_provider.py b/backend/app/embeddings/gemini_provider.py
index a97e92f..769c7ca 100644
--- a/backend/app/embeddings/gemini_provider.py
+++ b/backend/app/embeddings/gemini_provider.py
@@ -60,6 +60,7 @@ def __init__(
 
     @property
     def dim(self) -> int:
+        """Output vector dimension."""
         return self._dim
 
     def embed(self, texts: Sequence[str]) -> list[list[float]]:
diff --git a/backend/app/embeddings/openai_provider.py b/backend/app/embeddings/openai_provider.py
index 1292670..e1a0c81 100644
--- a/backend/app/embeddings/openai_provider.py
+++ b/backend/app/embeddings/openai_provider.py
@@ -43,9 +43,12 @@ def __init__(
 
     @property
     def dim(self) -> int:
+        """Output vector dimension."""
         return self._dim
 
     def embed(self, texts: Sequence[str]) -> list[list[float]]:
+        """Embed each text via OpenAI's ``/v1/embeddings`` — one vector per input,
+        in order. Returns ``[]`` for empty input and validates the returned length."""
         if not texts:
             return []
         # OpenAI accepts a list input and returns one embedding per input in order.
diff --git a/backend/app/ingest.py b/backend/app/ingest.py
index 8271f3f..f2639e8 100644
--- a/backend/app/ingest.py
+++ b/backend/app/ingest.py
@@ -194,6 +194,8 @@ def _build_parser() -> argparse.ArgumentParser:
 
 
 def main(argv: list[str] | None = None) -> int:
+    """Entry point for ``python -m backend.app.ingest`` (``make seed``): ingest every
+    document under the given path and print a summary. Returns a process exit code."""
     args = _build_parser().parse_args(argv)
     if not args.path.exists():
         print(f"error: path does not exist: {args.path}", file=sys.stderr)
diff --git a/backend/app/llm/claude.py b/backend/app/llm/claude.py
index 9f95e61..9638fa1 100644
--- a/backend/app/llm/claude.py
+++ b/backend/app/llm/claude.py
@@ -41,6 +41,7 @@ def __init__(
 
     @property
     def model_name(self) -> str:
+        """Identifier of the underlying Anthropic model."""
         return self._model
 
     def complete(
@@ -51,6 +52,7 @@ def complete(
         max_tokens: int,
         temperature: float,
     ) -> LLMResponse:
+        """Return a single completion from Anthropic's ``/v1/messages`` endpoint."""
         response = httpx.post(
             f"{self._base_url}/v1/messages",
             headers={
diff --git a/backend/app/llm/fake.py b/backend/app/llm/fake.py
index b66b0ce..856bee3 100644
--- a/backend/app/llm/fake.py
+++ b/backend/app/llm/fake.py
@@ -35,6 +35,8 @@ def complete(
         max_tokens: int,
         temperature: float,
     ) -> LLMResponse:
+        """Return ``response_factory(system, user)`` when a factory is set, otherwise
+        the canned response. Tests therefore fully control the output."""
         if self.response_factory is not None:
             text = self.response_factory(system, user)
         else:
diff --git a/backend/app/llm/gemini.py b/backend/app/llm/gemini.py
index 0723f7c..59ac14c 100644
--- a/backend/app/llm/gemini.py
+++ b/backend/app/llm/gemini.py
@@ -46,6 +46,7 @@ def __init__(
 
     @property
     def model_name(self) -> str:
+        """Identifier of the underlying Gemini model."""
         return self._model
 
     def complete(
diff --git a/backend/app/observability.py b/backend/app/observability.py
index 84b69c8..978997c 100644
--- a/backend/app/observability.py
+++ b/backend/app/observability.py
@@ -97,6 +97,8 @@ async def dispatch(
         request: Request,
         call_next: Callable[[Request], Awaitable[Response]],
     ) -> Response:
+        """Assign or propagate a request id, bind it to the structlog context for the
+        duration of the request, and echo it back on the response header."""
         inbound = request.headers.get(self.HEADER_NAME, "")
         request_id = _sanitise_inbound(inbound) or _generate_request_id()
         request.state.request_id = request_id
diff --git a/backend/app/repositories/chunks.py b/backend/app/repositories/chunks.py
index 5ffc055..c6fd605 100644
--- a/backend/app/repositories/chunks.py
+++ b/backend/app/repositories/chunks.py
@@ -56,4 +56,5 @@ def list_for_document(session: Session, document_id: int) -> list[Chunk]:
 
 
 def get(session: Session, chunk_id: int) -> Chunk | None:
+    """Return the chunk with ``chunk_id``, or ``None`` if it does not exist."""
     return session.get(Chunk, chunk_id)
diff --git a/backend/app/repositories/extractions.py b/backend/app/repositories/extractions.py
index b98322d..89b49af 100644
--- a/backend/app/repositories/extractions.py
+++ b/backend/app/repositories/extractions.py
@@ -35,6 +35,7 @@ def create(
 
 
 def get(session: Session, extraction_id: int) -> Extraction | None:
+    """Return the extraction with ``extraction_id``, or ``None`` if it does not exist."""
     return session.get(Extraction, extraction_id)
 
 
diff --git a/backend/app/repositories/workflow_items.py b/backend/app/repositories/workflow_items.py
index 76c4eb1..5a25adb 100644
--- a/backend/app/repositories/workflow_items.py
+++ b/backend/app/repositories/workflow_items.py
@@ -70,6 +70,7 @@ def create_if_absent(
 
 
 def get(session: Session, item_id: int) -> WorkflowItem | None:
+    """Return the workflow item with ``item_id``, or ``None`` if it does not exist."""
     return session.get(WorkflowItem, item_id)
 
 
diff --git a/backend/app/routers/dashboard.py b/backend/app/routers/dashboard.py
index de84063..02beb73 100644
--- a/backend/app/routers/dashboard.py
+++ b/backend/app/routers/dashboard.py
@@ -34,16 +34,22 @@ class VolumePoint(BaseModel):
 
 
 class VolumeResponse(BaseModel):
+    """Daily document-ingest volume series (``points``) over the last ``days``."""
+
     days: int = Field(ge=1)
     points: list[VolumePoint]
 
 
 class CategoryPoint(BaseModel):
+    """Document count for a single extraction schema."""
+
     schema_name: str
     count: int = Field(ge=0)
 
 
 class CategoryResponse(BaseModel):
+    """Per-schema document counts for the category breakdown."""
+
     points: list[CategoryPoint]
 
 
@@ -59,6 +65,8 @@ class ConfidenceBucket(BaseModel):
 
 
 class ConfidenceResponse(BaseModel):
+    """Per-field confidence histogram (buckets) plus the total field count."""
+
     buckets: list[ConfidenceBucket]
     total_fields: int = Field(ge=0)
 
@@ -71,6 +79,8 @@ class SlaBucket(BaseModel):
 
 
 class SlaResponse(BaseModel):
+    """Review-queue SLA summary: needs-review totals and per-age-bucket counts."""
+
     threshold_hours: int = Field(ge=1)
     total_needs_review: int = Field(ge=0)
     over_sla: int = Field(ge=0)
@@ -96,6 +106,8 @@ class Kpi(BaseModel):
 
 
 class KpiResponse(BaseModel):
+    """The dashboard KPI tiles plus the SLA threshold and a generated-at timestamp."""
+
     kpis: list[Kpi]
     threshold_hours: int = Field(ge=1)
     generated_at: str  # ISO-8601 UTC; lets the UI footnote show a real refresh time
diff --git a/backend/app/routers/review.py b/backend/app/routers/review.py
index accb56a..bfa9c9a 100644
--- a/backend/app/routers/review.py
+++ b/backend/app/routers/review.py
@@ -39,6 +39,8 @@ class ReviewItem(BaseModel):
 
 
 class ReviewQueueResponse(BaseModel):
+    """The list of extractions currently awaiting human review."""
+
     items: list[ReviewItem]
 
 
@@ -50,6 +52,8 @@ class ReviewDecisionRequest(BaseModel):
 
 
 class ReviewDecisionResponse(BaseModel):
+    """The persisted workflow-item state returned after an approve/reject decision."""
+
     id: int
     extraction_id: int
     status: Literal["auto_approved", "rejected"]