From 6d7678a31d8302753a00aaec076cf7fc2150c66c Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Mon, 8 Jun 2026 15:41:48 -0400 Subject: [PATCH] docs: backfill docstrings on public backend/app API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document every remaining public production definition CodeRabbit's docstring-coverage check flags: embedding/LLM provider impls (dim/embed/model_name/complete), the ingest CLI entry, the request-id middleware dispatch, the three repository get() accessors, and the dashboard/review Pydantic response models. Aligns with CLAUDE.md ('docstrings on public functions'); private helpers stay undocumented by that same convention. With PR #18 (which documents eval/), the repo's entire public production API is documented. Docstring-only — no behaviour change; make check green (222 backend pytest, ruff/mypy). --- backend/app/embeddings/fake.py | 2 ++ backend/app/embeddings/gemini_provider.py | 1 + backend/app/embeddings/openai_provider.py | 3 +++ backend/app/ingest.py | 2 ++ backend/app/llm/claude.py | 2 ++ backend/app/llm/fake.py | 2 ++ backend/app/llm/gemini.py | 1 + backend/app/observability.py | 2 ++ backend/app/repositories/chunks.py | 1 + backend/app/repositories/extractions.py | 1 + backend/app/repositories/workflow_items.py | 1 + backend/app/routers/dashboard.py | 12 ++++++++++++ backend/app/routers/review.py | 4 ++++ 13 files changed, 34 insertions(+) diff --git a/backend/app/embeddings/fake.py b/backend/app/embeddings/fake.py index 648d2d2..d578378 100644 --- a/backend/app/embeddings/fake.py +++ b/backend/app/embeddings/fake.py @@ -30,9 +30,11 @@ def __init__(self, dim: int = SCHEMA_EMBEDDING_DIM) -> None: @property def dim(self) -> int: + """Output vector dimension.""" return self._dim def embed(self, texts: Sequence[str]) -> list[list[float]]: + """Return one deterministic, hash-derived vector per input text, in order.""" return [self._embed_one(t) 
for t in texts] def _embed_one(self, text: str) -> list[float]: diff --git a/backend/app/embeddings/gemini_provider.py b/backend/app/embeddings/gemini_provider.py index a97e92f..769c7ca 100644 --- a/backend/app/embeddings/gemini_provider.py +++ b/backend/app/embeddings/gemini_provider.py @@ -60,6 +60,7 @@ def __init__( @property def dim(self) -> int: + """Output vector dimension.""" return self._dim def embed(self, texts: Sequence[str]) -> list[list[float]]: diff --git a/backend/app/embeddings/openai_provider.py b/backend/app/embeddings/openai_provider.py index 1292670..e1a0c81 100644 --- a/backend/app/embeddings/openai_provider.py +++ b/backend/app/embeddings/openai_provider.py @@ -43,9 +43,12 @@ def __init__( @property def dim(self) -> int: + """Output vector dimension.""" return self._dim def embed(self, texts: Sequence[str]) -> list[list[float]]: + """Embed each text via OpenAI's ``/v1/embeddings`` — one vector per input, + in order. Returns ``[]`` for empty input and validates the returned length.""" if not texts: return [] # OpenAI accepts a list input and returns one embedding per input in order. diff --git a/backend/app/ingest.py b/backend/app/ingest.py index 8271f3f..f2639e8 100644 --- a/backend/app/ingest.py +++ b/backend/app/ingest.py @@ -194,6 +194,8 @@ def _build_parser() -> argparse.ArgumentParser: def main(argv: list[str] | None = None) -> int: + """Entry point for ``python -m backend.app.ingest`` (``make seed``): ingest every + document under the given path and print a summary. 
Returns a process exit code.""" args = _build_parser().parse_args(argv) if not args.path.exists(): print(f"error: path does not exist: {args.path}", file=sys.stderr) diff --git a/backend/app/llm/claude.py b/backend/app/llm/claude.py index 9f95e61..9638fa1 100644 --- a/backend/app/llm/claude.py +++ b/backend/app/llm/claude.py @@ -41,6 +41,7 @@ def __init__( @property def model_name(self) -> str: + """Identifier of the underlying Anthropic model.""" return self._model def complete( @@ -51,6 +52,7 @@ def complete( max_tokens: int, temperature: float, ) -> LLMResponse: + """Return a single completion from Anthropic's ``/v1/messages`` endpoint.""" response = httpx.post( f"{self._base_url}/v1/messages", headers={ diff --git a/backend/app/llm/fake.py b/backend/app/llm/fake.py index b66b0ce..856bee3 100644 --- a/backend/app/llm/fake.py +++ b/backend/app/llm/fake.py @@ -35,6 +35,8 @@ def complete( max_tokens: int, temperature: float, ) -> LLMResponse: + """Return the canned response, or ``response_factory(system, user)`` when a + factory is set.
Deterministic by construction so tests fully control the output.""" if self.response_factory is not None: text = self.response_factory(system, user) else: diff --git a/backend/app/llm/gemini.py b/backend/app/llm/gemini.py index 0723f7c..59ac14c 100644 --- a/backend/app/llm/gemini.py +++ b/backend/app/llm/gemini.py @@ -46,6 +46,7 @@ def __init__( @property def model_name(self) -> str: + """Identifier of the underlying Gemini model.""" return self._model def complete( diff --git a/backend/app/observability.py b/backend/app/observability.py index 84b69c8..978997c 100644 --- a/backend/app/observability.py +++ b/backend/app/observability.py @@ -97,6 +97,8 @@ async def dispatch( request: Request, call_next: Callable[[Request], Awaitable[Response]], ) -> Response: + """Assign or propagate a request id, bind it to the structlog context for the + duration of the request, and echo it back on the response header.""" inbound = request.headers.get(self.HEADER_NAME, "") request_id = _sanitise_inbound(inbound) or _generate_request_id() request.state.request_id = request_id diff --git a/backend/app/repositories/chunks.py b/backend/app/repositories/chunks.py index 5ffc055..c6fd605 100644 --- a/backend/app/repositories/chunks.py +++ b/backend/app/repositories/chunks.py @@ -56,4 +56,5 @@ def list_for_document(session: Session, document_id: int) -> list[Chunk]: def get(session: Session, chunk_id: int) -> Chunk | None: + """Return the chunk with ``chunk_id``, or ``None`` if it does not exist.""" return session.get(Chunk, chunk_id) diff --git a/backend/app/repositories/extractions.py b/backend/app/repositories/extractions.py index b98322d..89b49af 100644 --- a/backend/app/repositories/extractions.py +++ b/backend/app/repositories/extractions.py @@ -35,6 +35,7 @@ def create( def get(session: Session, extraction_id: int) -> Extraction | None: + """Return the extraction with ``extraction_id``, or ``None`` if it does not exist.""" return session.get(Extraction, extraction_id) diff --git 
a/backend/app/repositories/workflow_items.py b/backend/app/repositories/workflow_items.py index 76c4eb1..5a25adb 100644 --- a/backend/app/repositories/workflow_items.py +++ b/backend/app/repositories/workflow_items.py @@ -70,6 +70,7 @@ def create_if_absent( def get(session: Session, item_id: int) -> WorkflowItem | None: + """Return the workflow item with ``item_id``, or ``None`` if it does not exist.""" return session.get(WorkflowItem, item_id) diff --git a/backend/app/routers/dashboard.py b/backend/app/routers/dashboard.py index de84063..02beb73 100644 --- a/backend/app/routers/dashboard.py +++ b/backend/app/routers/dashboard.py @@ -34,16 +34,22 @@ class VolumePoint(BaseModel): class VolumeResponse(BaseModel): + """Daily document-ingest volume series (``points``) over the last ``days``.""" + days: int = Field(ge=1) points: list[VolumePoint] class CategoryPoint(BaseModel): + """Document count for a single extraction schema.""" + schema_name: str count: int = Field(ge=0) class CategoryResponse(BaseModel): + """Per-schema document counts for the category breakdown.""" + points: list[CategoryPoint] @@ -59,6 +65,8 @@ class ConfidenceBucket(BaseModel): class ConfidenceResponse(BaseModel): + """Per-field confidence histogram (buckets) plus the total field count.""" + buckets: list[ConfidenceBucket] total_fields: int = Field(ge=0) @@ -71,6 +79,8 @@ class SlaBucket(BaseModel): class SlaResponse(BaseModel): + """Review-queue SLA summary: needs-review totals and per-age-bucket counts.""" + threshold_hours: int = Field(ge=1) total_needs_review: int = Field(ge=0) over_sla: int = Field(ge=0) @@ -96,6 +106,8 @@ class Kpi(BaseModel): class KpiResponse(BaseModel): + """The dashboard KPI tiles plus the SLA threshold and a generated-at timestamp.""" + kpis: list[Kpi] threshold_hours: int = Field(ge=1) generated_at: str # ISO-8601 UTC; lets the UI footnote show a real refresh time diff --git a/backend/app/routers/review.py b/backend/app/routers/review.py index accb56a..bfa9c9a 100644 
--- a/backend/app/routers/review.py +++ b/backend/app/routers/review.py @@ -39,6 +39,8 @@ class ReviewItem(BaseModel): class ReviewQueueResponse(BaseModel): + """The list of extractions currently awaiting human review.""" + items: list[ReviewItem] @@ -50,6 +52,8 @@ class ReviewDecisionRequest(BaseModel): class ReviewDecisionResponse(BaseModel): + """The persisted workflow-item state returned after an approve/reject decision.""" + id: int extraction_id: int status: Literal["auto_approved", "rejected"]