Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions PROGRESS.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

- **#13** — record real-provider eval numbers (M9 follow-up). Stays open until keys are wired and `make eval` is run for real.
- **Backlog (MILESTONES.md):** multi-tenant + RBAC, eval set expansion, OTel traces, Multi-AZ + private subnets + ACM TLS + S3/DynamoDB Terraform backend.
- **Design system** — dual-theme (dark default + light) audit-grade visual layer for the frontend + a real `GET /dashboard/kpis` endpoint, on branch `claude/serene-maxwell-54yMC` (draft PR). Net-new work beyond the M0–M11 roadmap; `make check` green (201 backend pytest, 7 frontend Vitest, ruff/mypy/tsc/build clean).

---

Expand Down Expand Up @@ -92,6 +93,8 @@ Status key: ☐ not started · ◐ in progress · ☑ merged
- 2026-05-28 (M4) — Extraction failures (`parse_error`, `schema_invalid`, `invalid_citation`, `document_not_found`, `no_chunks`, `unknown_schema`) **never persist** an `extractions` row. Surfacing the typed reason to the caller is enough for M5 guardrails / M7 audit / M9 eval to bucket failures without polluting the success-only table.
- 2026-05-28 (M4) — Citation validation reuses the M3 posture: a `source_chunk_id` not in the supplied chunk set is a hard failure (`invalid_citation`), not a silent drop. Same invariant the M3 RAG layer enforces with `[chunk:N]` markers.
- 2026-05-28 (M4) — Invoice `issue_date` remains persisted as a string but is schema-constrained to a real ISO `YYYY-MM-DD` date; non-ISO or impossible dates fail schema validation before persistence.
- 2026-05-29 (design system) — Applied the dual-theme "audit-grade" design system to the frontend: lifted the token layer into `styles.css` (`:root` dark default + `[data-theme="light"]`), self-hosted IBM Plex Sans/Mono via `@fontsource` (latin subset, offline-safe), added `lucide-react`, and a persisted `localStorage["sentinel-theme"]` toggle set before first paint (inline script in `index.html`, no FOUC). Real IA/routes/API client/Recharts unchanged; charts restyled with token `var()` fills + per-bar `<Cell>` colors so they re-theme live with the toggle.
- 2026-05-29 (design system) — Dashboard KPIs are powered by a new real endpoint `GET /dashboard/kpis` (docs ingested, auto-approved rate, avg confidence, SLA-at-risk). Every figure derives from real rows; 24h-vs-prior deltas are emitted only when both comparison windows have data (otherwise `null`/flat) — no fabricated numbers or deltas. The Review row renders only fields the `/review` payload returns; the prototype's `schema.field`/`value`/`confidence` are extraction-level details absent from the queue API, so they are not invented.

---

Expand Down
162 changes: 161 additions & 1 deletion backend/app/routers/dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from sqlalchemy.orm import Session

from backend.app.db import get_session
from backend.app.models import Extraction, WorkflowItem, WorkflowStatus
from backend.app.models import Document, Extraction, WorkflowItem, WorkflowStatus

router = APIRouter(prefix="/dashboard", tags=["dashboard"])

Expand Down Expand Up @@ -77,13 +77,71 @@ class SlaResponse(BaseModel):
buckets: list[SlaBucket]


class Kpi(BaseModel):
    """One operational KPI tile for the dashboard header.

    ``value``/``delta`` are the raw numbers (stable to assert on and to re-format);
    ``display``/``delta_display`` are the server-rendered strings the UI shows verbatim.
    ``direction`` drives the up/down/flat color (up=success, down=danger, flat=muted) and
    is keyed purely off the sign of ``delta`` — matching the design system's KPI semantics.
    """

    # Stable machine identifier for the tile (e.g. "docs_ingested", "sla_at_risk").
    key: str
    # Human-readable caption shown on the tile.
    label: str
    # Raw headline number; rates and means are carried as fractions, not percentages.
    value: float
    # Pre-formatted headline string (e.g. "75.0%", "0 / 1", or an em dash when unavailable).
    display: str
    # Raw change figure, or None when there is nothing to compare against.
    delta: float | None = None
    # Pre-formatted secondary text; may be present even when ``delta`` is None.
    delta_display: str | None = None
    # Color hint for the delta.
    direction: Literal["up", "down", "flat"]


class KpiResponse(BaseModel):
    """Response body of ``GET /dashboard/kpis``."""

    # KPI tiles, in the order the endpoint assembled them.
    kpis: list[Kpi]
    # SLA threshold (hours) the request was evaluated with, echoed back to the caller.
    threshold_hours: int = Field(ge=1)
    generated_at: str  # ISO-8601 UTC; lets the UI footnote show a real refresh time


# --- helpers ------------------------------------------------------------------------


def _utcnow() -> datetime:
return datetime.now(UTC)


def _direction(delta: float | None, *, eps: float = 1e-9) -> Literal["up", "down", "flat"]:
"""Color direction from the sign of a delta; ``None`` or near-zero reads as flat."""
if delta is None or abs(delta) <= eps:
return "flat"
return "up" if delta > 0 else "down"


def _mean(values: list[float]) -> float | None:
"""Arithmetic mean, or ``None`` for an empty list (so callers can omit, not fake, it)."""
return sum(values) / len(values) if values else None


def _workflow_counts(
    session: Session, *, since: datetime | None = None, until: datetime | None = None
) -> tuple[int, int]:
    """Count workflow items created inside the half-open window ``[since, until)``.

    A bound left as ``None`` is open on that side. Returns ``(auto_approved, total)``.
    """
    window = []
    if since is not None:
        window.append(WorkflowItem.created_at >= since)
    if until is not None:
        window.append(WorkflowItem.created_at < until)

    # Both counts share the window filter; the approved count only adds a status predicate.
    count_stmt = select(func.count(WorkflowItem.id))
    if window:
        count_stmt = count_stmt.where(*window)
    approved_stmt = count_stmt.where(WorkflowItem.status == WorkflowStatus.AUTO_APPROVED)

    total = int(session.scalar(count_stmt) or 0)
    approved = int(session.scalar(approved_stmt) or 0)
    return approved, total


# Bucket boundaries for confidence: ten 0.1-wide bins covering [0.0, 1.0]. Values
# exactly equal to 1.0 land in the last bucket.
_CONF_BOUNDARIES: list[tuple[float, float]] = [
Expand Down Expand Up @@ -234,6 +292,108 @@ def get_sla(
)


@router.get("/kpis", response_model=KpiResponse)
def get_kpis(
    session: Annotated[Session, Depends(get_session)],
    threshold_hours: Annotated[int, Query(ge=1, le=720)] = 24,
) -> KpiResponse:
    """Four operational KPIs for the dashboard header.

    Every figure is derived from real rows. Deltas compare the last 24h against the
    preceding 24h and are reported as ``None`` whenever a comparison window has no data,
    so the UI shows nothing fabricated. ``generated_at`` is a real UTC timestamp.

    ``threshold_hours`` only affects the SLA-at-risk tile; the 24h comparison windows used
    by the other tiles are fixed.
    """
    import math  # local import: only the confidence aggregation needs it

    now = _utcnow()
    last_24h = now - timedelta(hours=24)
    prev_24h = now - timedelta(hours=48)

    # 1) Docs ingested — total, plus how many landed in the last 24h.
    total_docs = int(session.scalar(select(func.count(Document.id))) or 0)
    docs_24h = int(
        session.scalar(select(func.count(Document.id)).where(Document.created_at >= last_24h)) or 0
    )
    docs_kpi = Kpi(
        key="docs_ingested",
        label="Docs ingested",
        value=float(total_docs),
        display=f"{total_docs:,}",
        delta=float(docs_24h),
        delta_display=f"+{docs_24h} (24h)",
        direction=_direction(float(docs_24h)),
    )

    # 2) Auto-approved rate — share of all workflow items auto-approved, with the delta in
    #    percentage points between the last-24h and preceding-24h cohorts.
    approved_all, total_all = _workflow_counts(session)
    rate_all = approved_all / total_all if total_all else 0.0
    approved_last, total_last = _workflow_counts(session, since=last_24h)
    approved_prev, total_prev = _workflow_counts(session, since=prev_24h, until=last_24h)
    rate_delta: float | None = None
    if total_last and total_prev:
        # Only comparable when both cohorts are non-empty; otherwise the delta stays None.
        rate_delta = (approved_last / total_last) - (approved_prev / total_prev)
    auto_kpi = Kpi(
        key="auto_approved_rate",
        label="Auto-approved",
        value=rate_all,
        display=f"{rate_all * 100:.1f}%",
        delta=rate_delta,
        delta_display=(f"{rate_delta * 100:+.1f}pp" if rate_delta is not None else None),
        direction=_direction(rate_delta),
    )

    # 3) Avg confidence — mean of every per-field confidence value, with a 24h-vs-prior delta.
    rows = session.execute(select(Extraction.created_at, Extraction.field_confidence)).all()
    all_vals: list[float] = []
    last_vals: list[float] = []
    prev_vals: list[float] = []
    for created_at, field_confidence in rows:
        if not isinstance(field_confidence, dict):
            continue
        row_vals: list[float] = []
        for value in field_confidence.values():
            try:
                v = float(value)
            except (TypeError, ValueError, OverflowError):
                continue
            # A single NaN/inf would poison the mean (and render as "nan"), so drop it.
            if math.isfinite(v):
                row_vals.append(v)
        all_vals.extend(row_vals)
        # The window depends only on the row's timestamp, so classify once per row rather
        # than once per field value.
        if not row_vals or created_at is None:
            continue
        if created_at >= last_24h:
            last_vals.extend(row_vals)
        elif prev_24h <= created_at < last_24h:
            prev_vals.extend(row_vals)
    mean_all = _mean(all_vals)
    mean_last = _mean(last_vals)
    mean_prev = _mean(prev_vals)
    conf_delta = mean_last - mean_prev if mean_last is not None and mean_prev is not None else None
    conf_kpi = Kpi(
        key="avg_confidence",
        label="Avg confidence",
        value=mean_all if mean_all is not None else 0.0,
        display=f"{mean_all:.3f}" if mean_all is not None else "—",
        delta=conf_delta,
        delta_display=(f"{conf_delta:+.3f}" if conf_delta is not None else None),
        direction=_direction(conf_delta),
    )

    # 4) SLA at risk — items past the threshold over the needs-review total (reuses /sla).
    sla = get_sla(session, threshold_hours=threshold_hours)
    sla_kpi = Kpi(
        key="sla_at_risk",
        label="SLA at risk",
        value=float(sla.over_sla),
        display=f"{sla.over_sla} / {sla.total_needs_review}",
        delta=None,
        # No numeric delta for this tile; the secondary text carries the threshold instead.
        delta_display=f"threshold {threshold_hours}h",
        direction="flat",
    )

    return KpiResponse(
        kpis=[docs_kpi, auto_kpi, conf_kpi, sla_kpi],
        threshold_hours=threshold_hours,
        generated_at=now.isoformat(),
    )


# Re-export the literal type used by the frontend's API client so a future schema
# evolution surfaces in one place.
# NOTE(review): the /kpis route validates ``threshold_hours`` as any int in [1, 720], so
# this literal is not enforced server-side there — presumably it mirrors the UI's preset
# choices; confirm against the frontend client.
SlaThresholdLiteral = Literal[1, 4, 24, 168, 720]
116 changes: 116 additions & 0 deletions backend/tests/test_dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,110 @@ def test_sla_rejects_invalid_threshold(client: TestClient) -> None:
assert client.get("/dashboard/sla?threshold_hours=10000").status_code == 422


# --- /dashboard/kpis ----------------------------------------------------------------


def test_kpis_empty_corpus(client: TestClient) -> None:
    """With an empty database the endpoint still returns four tiles, zeroed and delta-free."""
    response = client.get("/dashboard/kpis")
    assert response.status_code == 200

    by_key = {tile["key"]: tile for tile in response.json()["kpis"]}
    assert set(by_key) == {
        "docs_ingested",
        "auto_approved_rate",
        "avg_confidence",
        "sla_at_risk",
    }

    expected_display = {
        "docs_ingested": "0",
        "auto_approved_rate": "0.0%",
        "avg_confidence": "—",  # em dash: no fields, no fake number
        "sla_at_risk": "0 / 0",
    }
    for key, display in expected_display.items():
        assert by_key[key]["display"] == display

    # Window-based deltas must be absent (None) rather than invented.
    for key in ("auto_approved_rate", "avg_confidence"):
        assert by_key[key]["delta"] is None
    for tile in by_key.values():
        assert tile["direction"] == "flat"


def test_kpis_values_and_formatting(client: TestClient, session: Session) -> None:
    """Raw values and their server-rendered strings agree for a small seeded corpus."""
    now = datetime.now(UTC)
    # Two extractions (=> two documents) stamped now; the three confidences average 0.800.
    _make_extraction(
        session, hash_suffix="kv1", field_confidence={"a": 0.80, "b": 0.90}, created_at=now
    )
    second = _make_extraction(
        session, hash_suffix="kv2", field_confidence={"a": 0.70}, created_at=now
    )
    # Workflow mix: 3 auto-approved + 1 needs_review => 75.0% auto-approved.
    workflow_mix = [
        ("a0", WorkflowStatus.AUTO_APPROVED),
        ("a1", WorkflowStatus.AUTO_APPROVED),
        ("a2", WorkflowStatus.AUTO_APPROVED),
        ("nr", WorkflowStatus.NEEDS_REVIEW),
    ]
    for suffix, status in workflow_mix:
        _make_workflow_item(session, extraction_id=second.id, idem_suffix=suffix, status=status)

    payload = client.get("/dashboard/kpis").json()
    by_key = {tile["key"]: tile for tile in payload["kpis"]}

    docs = by_key["docs_ingested"]
    assert docs["value"] == 2.0
    assert docs["display"] == "2"
    assert docs["delta"] == 2.0  # both landed within the last 24h
    assert docs["delta_display"] == "+2 (24h)"

    confidence = by_key["avg_confidence"]
    assert confidence["value"] == pytest.approx(0.8)
    assert confidence["display"] == "0.800"

    auto = by_key["auto_approved_rate"]
    assert auto["value"] == pytest.approx(0.75)
    assert auto["display"] == "75.0%"

    # The lone needs_review item has age ~0, so it is not yet past the 24h threshold.
    assert by_key["sla_at_risk"]["display"] == "0 / 1"


def test_kpis_auto_approved_delta(client: TestClient, session: Session) -> None:
    """The delta is the last-24h auto-approved rate minus the rate of the 24h before it."""
    extraction = _make_extraction(session, hash_suffix="kad", field_confidence={"a": 0.9})
    approved = WorkflowStatus.AUTO_APPROVED
    review = WorkflowStatus.NEEDS_REVIEW
    seeded = [
        # Preceding 24–48h window: 2 items, 1 auto-approved => rate 0.50.
        ("p1", approved, 36),
        ("p2", review, 36),
        # Last-24h window: 4 items, 3 auto-approved => rate 0.75.
        ("l0", approved, 2),
        ("l1", approved, 2),
        ("l2", approved, 2),
        ("lnr", review, 2),
    ]
    for suffix, status, age in seeded:
        _make_workflow_item(
            session,
            extraction_id=extraction.id,
            idem_suffix=suffix,
            status=status,
            age_hours=age,
        )

    tiles = client.get("/dashboard/kpis").json()["kpis"]
    auto = next(tile for tile in tiles if tile["key"] == "auto_approved_rate")
    assert auto["delta"] == pytest.approx(0.25)  # 0.75 - 0.50
    assert auto["delta_display"] == "+25.0pp"
    assert auto["direction"] == "up"


def test_kpis_rejects_invalid_threshold(client: TestClient) -> None:
    """Thresholds outside the accepted range are rejected with a 422."""
    rejected_urls = (
        "/dashboard/kpis?threshold_hours=0",
        "/dashboard/kpis?threshold_hours=-1",
        "/dashboard/kpis?threshold_hours=10000",
    )
    for url in rejected_urls:
        assert client.get(url).status_code == 422


# --- shape sanity (frontend depends on these keys) ----------------------------------


Expand All @@ -243,6 +347,18 @@ def test_response_keys_match_frontend_contract(client: TestClient) -> None:
sla = client.get("/dashboard/sla").json()
assert set(sla.keys()) == {"threshold_hours", "total_needs_review", "over_sla", "buckets"}
assert set(sla["buckets"][0].keys()) == {"label", "count"}
kpis = client.get("/dashboard/kpis").json()
assert set(kpis.keys()) == {"kpis", "threshold_hours", "generated_at"}
assert len(kpis["kpis"]) == 4
assert set(kpis["kpis"][0].keys()) == {
"key",
"label",
"value",
"display",
"delta",
"delta_display",
"direction",
}


# --- silence the unused-import warning on Chunk (used by other test modules) -------
Expand Down
17 changes: 16 additions & 1 deletion frontend/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,23 @@
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="color-scheme" content="light dark" />
<meta name="color-scheme" content="dark light" />
<link rel="icon" href="/brand/sentinel-mark.svg" />
<title>Sentinel</title>
<script>
  // Apply the persisted theme before first paint so the stored choice never flashes.
  // Dark is the default; the useTheme hook in src/theme.ts mirrors this logic.
  (function () {
    var theme = "dark";
    try {
      theme = localStorage.getItem("sentinel-theme") || "dark";
    } catch (e) {
      // Storage can be unavailable (e.g. blocked by the browser); keep the dark default.
    }
    document.documentElement.setAttribute("data-theme", theme);
  })();
</script>
</head>
<body>
<div id="root"></div>
Expand Down
Loading
Loading