From cf0c4ac055f8e0f17408ae9f838b634e17a95578 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andre=CC=81=20Lange?= <andre.lange@typelicious.com>
Date: Thu, 12 Mar 2026 16:06:42 +0100
Subject: [PATCH] feat(obs): add provider inventory and capability coverage

---
 CHANGELOG.md                      |   1 +
 README.md                         |  20 +++-
 docs/ARCHITECTURE.md              |   5 +
 docs/FOUNDRYGATE-ROADMAP.md       |   4 +-
 foundrygate/main.py               | 154 ++++++++++++++++++++++++++----
 foundrygate/providers.py          |   1 +
 tests/test_route_introspection.py |  39 ++++++++
 7 files changed, 199 insertions(+), 25 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4bcae09..37847d4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ The format is intentionally lightweight and human-readable. Group entries by rel
 - Added modality-aware metrics and filters so stats, traces, recent requests, and the dashboard can distinguish `chat`, `image_generation`, and `image_editing`
 - Added `POST /api/route/image` for dry-run preview of image-generation and image-editing routing decisions
 - Added optional `image` provider metadata (`max_outputs`, `max_side_px`, `supported_sizes`) so image-capable providers can be ranked against `n` and `size`
+- Added top-level capability coverage to `GET /health`, plus a new `GET /api/providers` endpoint for filtered provider inventory and dashboard coverage views
 
 ## v0.5.0 - 2026-03-12
 
diff --git a/README.md b/README.md
index 8efd544..485a748 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ FoundryGate is a local OpenAI-compatible router/proxy for OpenClaw and other cli
 - Multi-provider routing: use `auto` for routing or target a provider directly by model id.
 - Multi-dimensional routing: score providers across locality, context headroom, token limits, cache metadata, latency, and recent failure state during provider selection.
 - Robust fallback behavior: provider errors, timeouts, and connection failures fall through the configured fallback chain.
-- Useful observability: `/health` reports provider status, consecutive failures, last error, and average latency.
+- Useful observability: `/health` reports provider status, capability coverage, consecutive failures, last error, and average latency.
 - Hardened extension seam: request hooks are sanitized, can fail closed, and expose hook errors in dry-run and completion responses.
 - Safe database path handling: metrics use `FOUNDRYGATE_DB_PATH`, so the SQLite database does not need to live in the repo checkout.
 
@@ -153,7 +153,7 @@ These endpoints are implemented today in [foundrygate/main.py](./foundrygate/mai
 
 ### `GET /health`
 
-Returns overall service status plus one object per loaded provider. Each provider entry includes:
+Returns overall service status, provider summary, capability coverage, and one object per loaded provider. Each provider entry includes:
 
 - `healthy`
 - `consecutive_failures`
@@ -163,11 +163,23 @@ Returns overall service status plus one object per loaded provider. Each provide
 - `backend`
 - `tier`
 - `capabilities`
+- `image`
 
 ```bash
 curl -fsS http://127.0.0.1:8090/health
 ```
 
+### `GET /api/providers`
+
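+Returns the loaded provider inventory plus the same capability-coverage summary that `GET /health` reports and the dashboard renders. The filters below narrow only the `providers` list; `coverage` always spans every loaded provider.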
+
+- optional `capability=<name>` filter
+- optional `healthy=true|false` filter
+
+```bash
+curl -fsS 'http://127.0.0.1:8090/api/providers?capability=image_generation'
+```
+
 ### `GET /v1/models`
 
 Returns an OpenAI-compatible model list. It always includes the virtual `auto` model, plus one entry for every provider that actually loaded at startup.
@@ -240,6 +252,7 @@ curl -fsS http://127.0.0.1:8090/v1/images/edits \
 
 - `POST /api/route`
 - `POST /api/route/image`
+- `GET /api/providers`
 - `GET /api/update`
 - `GET /api/stats`
 - `GET /api/recent?limit=50`
@@ -267,6 +280,7 @@ curl -fsS http://127.0.0.1:8090/api/route/image \
 
 curl -fsS http://127.0.0.1:8090/api/stats
 curl -fsS http://127.0.0.1:8090/api/update
+curl -fsS 'http://127.0.0.1:8090/api/providers?healthy=true'
 curl -fsS 'http://127.0.0.1:8090/api/recent?limit=10'
 curl -fsS 'http://127.0.0.1:8090/api/traces?limit=10'
 curl -fsS 'http://127.0.0.1:8090/api/stats?provider=local-worker&client_tag=codex&modality=chat'
@@ -278,6 +292,8 @@ curl -fsS 'http://127.0.0.1:8090/api/stats?provider=local-worker&client_tag=code
 
 If request hooks are enabled, `POST /api/route` also shows the applied hook names and the effective request metadata after hook processing.
 
+`GET /api/providers` returns the current provider inventory, including capability flags and optional image metadata such as `max_outputs`, `max_side_px`, and `supported_sizes`.
+
 `GET /api/stats`, `GET /api/recent`, and `GET /api/traces` also accept optional `provider`, `modality`, `client_profile`, `client_tag`, `layer`, and `success` filters. The built-in dashboard uses the same filtered endpoints.
 
 `GET /api/traces` returns recent enriched routing records from the metrics store, including requested model, modality, resolved client profile, client tag, decision reason, confidence, and attempt order.
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 28df513..945b644 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -91,6 +91,7 @@ Request hooks sit beside these caller-aware signals as a narrow extension seam.
 The main operational endpoints are:
 
 - `GET /health`
+- `GET /api/providers`
 - `GET /v1/models`
 - `POST /v1/chat/completions`
 - `POST /v1/images/generations`
@@ -101,6 +102,10 @@ The main operational endpoints are:
 - `GET /api/traces`
 - `GET /dashboard`
 
+`/health` now exposes both provider-level health and top-level capability coverage, so operators can quickly see whether the gateway currently has healthy support for `chat`, `image_generation`, `image_editing`, or other boolean capabilities exposed by loaded providers.
+
+`/api/providers` exposes the normalized provider inventory with optional `capability` and `healthy` filters. This is the inventory surface the dashboard should use when it needs provider metadata beyond raw request metrics.
+
 `/api/stats`, `/api/recent`, and `/api/traces` can now be filtered by provider, client profile, client tag, layer, and success state. The dashboard is a thin UI over those same filtered endpoints and persists its active filters in the URL so operators can share one filtered view.
 
 ## Design target
diff --git a/docs/FOUNDRYGATE-ROADMAP.md b/docs/FOUNDRYGATE-ROADMAP.md
index 722a911..81904e1 100644
--- a/docs/FOUNDRYGATE-ROADMAP.md
+++ b/docs/FOUNDRYGATE-ROADMAP.md
@@ -15,7 +15,7 @@ The foundation that used to be the near-term buildout is largely in place:
 - route introspection
 - routing traces and client/profile metrics
 - local worker probing
-- a hardened simple dashboard with filtered traces, client/provider views, URL-persisted filters, and operator summary cards
+- a hardened simple dashboard with filtered traces, client/provider views, URL-persisted filters, operator summary cards, and modality/capability coverage
 
 This roadmap now shifts from "rename and foundation" to "deepen the gateway plane without bloating it".
 
@@ -195,7 +195,7 @@ Primary goals:
 - add modality-aware provider contracts, starting with image generation
 - extend that contract toward image editing where the provider surface supports it
 - keep chat and image paths explicit instead of mixing modality-specific behavior into one opaque route
-- expose modality-aware health and routing visibility in the dashboard and operational endpoints
+- expose modality-aware health, provider inventory, and routing visibility in the dashboard and operational endpoints
 
 This should borrow the useful parts of image-router patterns without copying another gateway's product shape.
 
diff --git a/foundrygate/main.py b/foundrygate/main.py
index 4ecb6a2..07157a4 100644
--- a/foundrygate/main.py
+++ b/foundrygate/main.py
@@ -178,9 +178,69 @@ def _serialize_provider(name: str) -> dict[str, Any] | None:
         "context_window": provider.context_window,
         "limits": provider.limits,
         "cache": provider.cache,
+        "image": getattr(provider, "image", {}),
     }
 
 
+def _build_provider_inventory(
+    *,
+    capability: str | None = None,
+    healthy: bool | None = None,
+) -> list[dict[str, Any]]:
+    """Return provider inventory rows (healthy first, then by name) with optional filters."""
+    rows: list[dict[str, Any]] = []
+    for name, provider in _providers.items():
+        # Only an explicit True counts, matching _build_capability_coverage().
+        if capability and provider.capabilities.get(capability) is not True:
+            continue
+        if healthy is not None and bool(provider.health.healthy) != bool(healthy):
+            continue
+        rows.append(
+            {
+                "name": name,
+                "model": provider.model,
+                "backend": provider.backend_type,
+                "contract": provider.contract,
+                "tier": provider.tier,
+                "healthy": provider.health.healthy,
+                "capabilities": provider.capabilities,
+                "context_window": provider.context_window,
+                "limits": provider.limits,
+                "cache": provider.cache,
+                "image": getattr(provider, "image", {}),
+                "last_error": getattr(provider.health, "last_error", ""),
+                "avg_latency_ms": getattr(provider.health, "avg_latency_ms", 0.0),
+            }
+        )
+    # Sort on truthiness so any falsy health flag is ordered as unhealthy.
+    return sorted(rows, key=lambda row: (not row["healthy"], row["name"]))
+
+
+def _build_capability_coverage() -> dict[str, dict[str, Any]]:
+    """Summarize, per capability, which loaded providers offer it and which are healthy."""
+    summary: dict[str, dict[str, Any]] = {}
+    for provider_name, provider in _providers.items():
+        # Only flags that are exactly True count toward coverage.
+        enabled = [cap for cap, flag in provider.capabilities.items() if flag is True]
+        for cap in enabled:
+            if cap not in summary:
+                summary[cap] = {
+                    "total": 0,
+                    "healthy": 0,
+                    "providers": [],
+                    "healthy_providers": [],
+                }
+            entry = summary[cap]
+            entry["total"] += 1
+            entry["providers"].append(provider_name)
+            if provider.health.healthy:
+                entry["healthy"] += 1
+                entry["healthy_providers"].append(provider_name)
+
+    # Hand back the buckets ordered by capability name.
+    return dict(sorted(summary.items()))
+
+
 def _estimate_request_dimensions(body: dict[str, Any]) -> dict[str, int | str]:
     """Return lightweight request-dimension estimates for debugging and routing preview."""
     messages = body.get("messages", [])
@@ -511,21 +571,45 @@ async def lifespan(app: FastAPI):
 @app.get("/health")
 async def health():
     await _refresh_local_worker_probes()
+    providers = {
+        name: {
+            **p.health.to_dict(),
+            "contract": p.contract,
+            "backend": p.backend_type,
+            "tier": p.tier,
+            "capabilities": p.capabilities,
+            "context_window": p.context_window,
+            "limits": p.limits,
+            "cache": p.cache,
+            "image": getattr(p, "image", {}),
+        }
+        for name, p in _providers.items()
+    }
     return {
         "status": "ok",
-        "providers": {
-            name: {
-                **p.health.to_dict(),
-                "contract": p.contract,
-                "backend": p.backend_type,
-                "tier": p.tier,
-                "capabilities": p.capabilities,
-                "context_window": p.context_window,
-                "limits": p.limits,
-                "cache": p.cache,
-            }
-            for name, p in _providers.items()
+        "summary": {
+            "providers_total": len(providers),
+            "providers_healthy": sum(1 for provider in providers.values() if provider["healthy"]),
+            "providers_unhealthy": sum(
+                1 for provider in providers.values() if not provider["healthy"]
+            ),
         },
+        "coverage": _build_capability_coverage(),
+        "providers": providers,
+    }
+
+
+@app.get("/api/providers")
+async def provider_inventory(
+    capability: str | None = None,
+    healthy: bool | None = None,
+):
+    """Return filtered provider rows plus coverage; coverage always spans all loaded providers."""
+    await _refresh_local_worker_probes()
+    rows = _build_provider_inventory(capability=capability, healthy=healthy)
+    return {
+        "providers": rows,
+        "coverage": _build_capability_coverage(),
     }
 
 
@@ -1235,7 +1319,14 @@ def main():
 <div class="sect">
   <h2>Provider Health</h2>
   <table id="health"><thead><tr>
-    <th>Provider</th><th>Status</th><th>Contract</th><th>Tier</th><th>Context</th><th>Limits</th><th>Cache</th><th>Latency</th><th>Last Error</th>
+    <th>Provider</th><th>Status</th><th>Contract</th><th>Tier</th><th>Capabilities</th><th>Context</th><th>Limits</th><th>Cache</th><th>Latency</th><th>Last Error</th>
+  </tr></thead><tbody></tbody></table>
+</div>
+
+<div class="sect">
+  <h2>Capability Coverage</h2>
+  <table id="coverage"><thead><tr>
+    <th>Capability</th><th>Healthy</th><th>Total</th><th>Healthy Providers</th><th>All Providers</th>
   </tr></thead><tbody></tbody></table>
 </div>
 
@@ -1349,26 +1440,36 @@ def main():
   return parts.length ? esc(parts.join(' / ')) : '—';
 }
 
+function formatCapabilities(provider){
+  // One escaped pill per capability whose flag is exactly true; em dash when none.
+  const pills = Object.entries(provider?.capabilities || {}).flatMap(([name, value]) =>
+    value === true ? [`<span class="pill">${esc(name)}</span>`] : []);
+  return pills.length ? pills.join(' ') : '—';
+}
+
 async function load(){
   try{
     const query = currentFilters();
     persistFilters(query);
     const queryStr = query.toString();
     const suffix = queryStr ? `?${queryStr}` : '';
-    const [health, stats, traces, rec, update] = await Promise.all([
+    const [health, stats, traces, rec, update, inventory] = await Promise.all([
       fetch('/health').then(r=>r.json()),
       fetch(`/api/stats${suffix}`).then(r=>r.json()),
       fetch(`/api/traces${suffix}${suffix ? '&' : '?'}limit=20`).then(r=>r.json()),
       fetch(`/api/recent${suffix}${suffix ? '&' : '?'}limit=20`).then(r=>r.json()),
-      fetch('/api/update').then(r=>r.json()).catch(() => ({enabled:false,status:'unavailable'}))
+      fetch('/api/update').then(r=>r.json()).catch(() => ({enabled:false,status:'unavailable'})),
+      fetch('/api/providers').then(r=>r.json()),
     ]);
 
     const totals = stats.totals || {};
-    const providers = Object.values(health.providers || {});
-    const healthyProviders = providers.filter(provider => provider.healthy).length;
-    const unhealthyProviders = providers.length - healthyProviders;
+    const providers = inventory.providers || Object.values(health.providers || {});
+    const healthyProviders = (health.summary && health.summary.providers_healthy) || providers.filter(provider => provider.healthy).length;
+    const unhealthyProviders = (health.summary && health.summary.providers_unhealthy) || (providers.length - healthyProviders);
     const modalityRows = stats.modalities || [];
     const topModality = modalityRows.length ? modalityRows[0].modality : '—';
+    const capabilityCoverage = inventory.coverage || health.coverage || {};
+    const coverageEntries = Object.entries(capabilityCoverage);
     $('#status').style.background = '#5e5';
     $('#ago').textContent = ago(totals.last_request);
 
@@ -1380,22 +1481,33 @@ def main():
       <div class="card"><div class="label">Cache Hit Rate</div><div class="value cost">${fmt(totals.cache_hit_pct || 0,1)}%</div><div class="detail">${fmtTok(totals.total_cache_hit || 0)} hit / ${fmtTok(totals.total_cache_miss || 0)} miss</div></div>
       <div class="card"><div class="label">Failures</div><div class="value ${(totals.total_failures||0)>0?'err':''}">${totals.total_failures || 0}</div></div>
       <div class="card"><div class="label">Healthy Providers</div><div class="value">${healthyProviders}/${providers.length}</div><div class="detail">${unhealthyProviders} unhealthy</div></div>
+      <div class="card"><div class="label">Capability Coverage</div><div class="value">${coverageEntries.length}</div><div class="detail">${esc(coverageEntries.map(([name]) => name).slice(0,3).join(', ') || 'none')}</div></div>
       <div class="card"><div class="label">Top Modality</div><div class="value">${esc(topModality)}</div><div class="detail">${modalityRows.length} modality groups</div></div>
       <div class="card"><div class="label">Release Status</div><div class="value ${update.update_available ? 'cost' : ''}">${esc(update.latest_version || update.current_version || 'n/a')}</div><div class="detail">${update.enabled ? (update.update_available ? 'Update available' : update.status === 'ok' ? 'Up to date' : 'Update check unavailable') : 'Update checks disabled'}</div></div>
     `;
 
-    const providerRows = Object.entries(health.providers || {}).map(([name, provider]) => `<tr>
-      <td><strong>${esc(name)}</strong></td>
+    const providerRows = providers.map(provider => `<tr>
+      <td><strong>${esc(provider.name)}</strong></td>
       <td>${statusTag(provider.healthy)}</td>
       <td>${esc(provider.contract || 'generic')}</td>
       <td>${esc(provider.tier || 'default')}</td>
+      <td>${formatCapabilities(provider)}</td>
       <td class="mono">${provider.context_window ? fmtTok(provider.context_window) : '—'}</td>
       <td class="mono">${formatLimits(provider)}</td>
       <td><span class="pill">${esc((provider.cache && provider.cache.mode) || 'none')}</span></td>
       <td class="mono">${fmtMs(provider.avg_latency_ms)}</td>
       <td class="mono">${esc(provider.last_error || '—')}</td>
     </tr>`);
-    $('#health tbody').innerHTML = providerRows.length ? providerRows.join('') : emptyRow(9, 'No provider health data');
+    $('#health tbody').innerHTML = providerRows.length ? providerRows.join('') : emptyRow(10, 'No provider health data');
+
+    const coverageRows = coverageEntries.map(([capability, data]) => `<tr>
+      <td><span class="pill">${esc(capability)}</span></td>
+      <td>${data.healthy || 0}</td>
+      <td>${data.total || 0}</td>
+      <td class="mono">${esc((data.healthy_providers || []).join(', ') || '—')}</td>
+      <td class="mono">${esc((data.providers || []).join(', ') || '—')}</td>
+    </tr>`);
+    $('#coverage tbody').innerHTML = coverageRows.length ? coverageRows.join('') : emptyRow(5, 'No capability coverage data');
 
     const clientRows = (stats.clients || []).map(row => `<tr>
       <td><span class="pill">${esc(row.modality || 'chat')}</span></td>
diff --git a/foundrygate/providers.py b/foundrygate/providers.py
index 16331e1..337799c 100644
--- a/foundrygate/providers.py
+++ b/foundrygate/providers.py
@@ -74,6 +74,7 @@ def __init__(self, name: str, cfg: dict):
         self.context_window = cfg.get("context_window")
         self.limits = dict(cfg.get("limits", {}))
         self.cache = dict(cfg.get("cache", {}))
+        self.image = dict(cfg.get("image", {}))
         self.health = ProviderHealth(name=name)
 
         self._client = httpx.AsyncClient(
diff --git a/tests/test_route_introspection.py b/tests/test_route_introspection.py
index 87c2cd0..9f96dd2 100644
--- a/tests/test_route_introspection.py
+++ b/tests/test_route_introspection.py
@@ -46,7 +46,9 @@ async def aclose(self):
     _refresh_local_worker_probes,
     _resolve_image_route_preview,
     _resolve_route_preview,
+    health,
     preview_image_route,
+    provider_inventory,
 )
 from foundrygate.router import Router
 
@@ -98,6 +100,7 @@ def __init__(
         tier: str = "default",
         healthy: bool = True,
         capabilities: dict | None = None,
+        image: dict | None = None,
     ):
         self.name = name
         self.model = model
@@ -108,6 +111,7 @@ def __init__(
         self.context_window = 0
         self.limits = {}
         self.cache = {"mode": "none", "read_discount": False}
+        self.image = image or {}
         self.health = types.SimpleNamespace(
             healthy=healthy,
             last_check=0.0,
@@ -227,6 +231,11 @@ def preview_config(tmp_path, monkeypatch):
                     "image_generation": True,
                     "image_editing": True,
                 },
+                image={
+                    "max_outputs": 1,
+                    "max_side_px": 1024,
+                    "supported_sizes": ["1024x1024"],
+                },
             ),
             "image-large": _ProviderStub(
                 name="image-large",
@@ -240,6 +249,11 @@ def preview_config(tmp_path, monkeypatch):
                     "image_generation": True,
                     "image_editing": True,
                 },
+                image={
+                    "max_outputs": 4,
+                    "max_side_px": 2048,
+                    "supported_sizes": ["1024x1024", "2048x2048"],
+                },
             ),
         },
         raising=False,
@@ -433,3 +447,28 @@ async def test_refresh_only_probes_local_worker_contracts(self, preview_config):
 
         assert local_worker.probe_calls == 1
         assert cloud_default.probe_calls == 0
+
+
+class TestProviderCoverage:
+    @pytest.mark.asyncio
+    async def test_health_reports_capability_coverage(self, preview_config):
+        """/health reports the provider summary, capability coverage, and image metadata."""
+        payload = await health()
+        coverage = payload["coverage"]
+
+        assert payload["summary"]["providers_total"] == 4
+        assert payload["summary"]["providers_healthy"] == 4
+        assert coverage["image_generation"]["total"] == 2
+        assert coverage["image_generation"]["healthy"] == 2
+        assert coverage["image_editing"]["providers"] == ["image-cloud", "image-large"]
+        assert payload["providers"]["image-cloud"]["image"]["max_outputs"] == 1
+
+    @pytest.mark.asyncio
+    async def test_provider_inventory_filters_by_capability(self, preview_config):
+        """Filtering by capability returns only the providers that advertise it."""
+        payload = await provider_inventory(capability="image_editing")
+
+        names = [row["name"] for row in payload["providers"]]
+        assert names == ["image-cloud", "image-large"]
+        assert payload["coverage"]["image_editing"]["total"] == 2
+        assert payload["providers"][0]["contract"] == "image-provider"