From 96c63630e8ff757e276c031f3ce05de85bba9976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Lange?= Date: Thu, 12 Mar 2026 17:42:36 +0100 Subject: [PATCH] feat(obs): add operator event metrics --- CHANGELOG.md | 1 + README.md | 5 ++ docs/TROUBLESHOOTING.md | 1 + foundrygate/main.py | 53 ++++++++++++++- foundrygate/metrics.py | 108 +++++++++++++++++++++++++++++++ scripts/foundrygate-auto-update | 9 ++- scripts/foundrygate-update-check | 6 +- tests/test_metrics_traces.py | 41 ++++++++++++ 8 files changed, 221 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 338b94e..da9ad59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ The format is intentionally lightweight and human-readable. Group entries by rel - Added stronger update-alert metadata to `GET /api/update`, including update type, alert level, and recommended action for operators and dashboard consumers - Added an opt-in `auto_update` policy block plus `foundrygate-auto-update` so controlled deployments can gate helper-driven updates without enabling silent self-updates +- Added `GET /api/operator-events` plus operator-event metrics for update checks and helper-driven auto-update attempts ## v0.6.0 - 2026-03-12 diff --git a/README.md b/README.md index 453f00e..541169d 100644 --- a/README.md +++ b/README.md @@ -258,6 +258,7 @@ curl -fsS http://127.0.0.1:8090/v1/images/edits \ - `POST /api/route/image` - `GET /api/providers` - `GET /api/update` +- `GET /api/operator-events` - `GET /api/stats` - `GET /api/recent?limit=50` - `GET /api/traces?limit=50` @@ -284,6 +285,7 @@ curl -fsS http://127.0.0.1:8090/api/route/image \ curl -fsS http://127.0.0.1:8090/api/stats curl -fsS http://127.0.0.1:8090/api/update +curl -fsS http://127.0.0.1:8090/api/operator-events curl -fsS 'http://127.0.0.1:8090/api/providers?healthy=true' curl -fsS 'http://127.0.0.1:8090/api/recent?limit=10' curl -fsS 'http://127.0.0.1:8090/api/traces?limit=10' @@ -531,6 +533,7 @@ update_check: ``` The status is exposed through `GET /api/update`, the dashboard, and the helper script `foundrygate-update-check`. +Recent operator-side update checks and apply attempts are exposed through `GET /api/operator-events`. FoundryGate also supports an optional `auto_update` policy block for controlled environments. This stays strictly opt-in and only marks whether the current release state is eligible for a helper-driven update command. @@ -823,6 +826,7 @@ What it does: - exposes the cached status in `GET /api/update` - surfaces the same status in the dashboard and `foundrygate-update-check` - exposes opt-in auto-update eligibility and the configured apply command +- records operator-side update checks and apply attempts in `GET /api/operator-events` What it does not do: @@ -834,6 +838,7 @@ Manual check: ```bash curl -fsS http://127.0.0.1:8090/api/update +curl -fsS http://127.0.0.1:8090/api/operator-events ./scripts/foundrygate-update-check ./scripts/foundrygate-auto-update ./scripts/foundrygate-auto-update --apply diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index 99c6e3c..e960205 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -148,6 +148,7 @@ Check the cached runtime view first: ```bash curl -fsS http://127.0.0.1:8090/api/update +curl -fsS http://127.0.0.1:8090/api/operator-events ./scripts/foundrygate-update-check ./scripts/foundrygate-auto-update ``` diff --git a/foundrygate/main.py b/foundrygate/main.py index 7abae91..2f6071b 100644 --- a/foundrygate/main.py +++ b/foundrygate/main.py @@ -91,6 +91,13 @@ def _collect_routing_headers(request: Request) -> dict[str, str]: return {k.lower(): v for k, v in request.headers.items() if k.lower().startswith(prefixes)} +def _collect_operator_context(headers: dict[str, str]) -> tuple[str, str]: + """Return operator action and client tag hints from request headers.""" + action = headers.get("x-foundrygate-operator-action", "update-check").strip().lower() + client_tag = headers.get("x-foundrygate-client", "operator").strip().lower() or "operator" + return action, client_tag + + def _match_client_profile_rule(match: dict, headers: dict[str, str]) -> bool: """Evaluate one client profile match block.""" if not match: @@ -732,6 +739,8 @@ async def stats( client_tag: str | None = None, layer: str | None = None, success: bool | None = None, + operator_action: str | None = None, + operator_status: str | None = None, ): """Full statistics: totals, per-provider, routing breakdown, time series.""" filters = { @@ -742,12 +751,18 @@ async def stats( "layer": layer, "success": success, } + operator_filters = { + "action": operator_action, + "status": operator_status, + "client_tag": client_tag, + } return { "totals": _metrics.get_totals(**filters), "providers": _metrics.get_provider_summary(**filters), "modalities": _metrics.get_modality_breakdown(**filters), "routing": _metrics.get_routing_breakdown(**filters), "clients": _metrics.get_client_breakdown(**filters), + "operator_actions": _metrics.get_operator_breakdown(**operator_filters), "hourly": _metrics.get_hourly_series(24), "daily": _metrics.get_daily_totals(30), } @@ -802,12 +817,48 @@ async def traces( @app.get("/api/update") -async def update_status(force: bool = False): +async def update_status(request: Request, force: bool = False): """Return cached or fresh release update metadata.""" + headers = _collect_routing_headers(request) status = await _update_checker.get_status(force=force) + operator_action, client_tag = _collect_operator_context(headers) + auto_update = status.auto_update or {} + _metrics.log_operator_event( + event_type="update", + action=operator_action, + client_tag=client_tag, + status=status.status, + update_type=status.update_type, + target_version=status.latest_version or status.current_version, + eligible=bool(auto_update.get("eligible", False)), + recommended_action=status.recommended_action, + detail=auto_update.get("blocked_reason", ""), + ) return status.to_dict() +@app.get("/api/operator-events") +async def operator_events( + limit: int = 50, + action: str | None = None, + status: str | None = None, + client_tag: str | None = None, + update_type: str | None = None, + eligible: bool | None = None, +): + """Recent operator events such as update checks and apply attempts.""" + return { + "events": _metrics.get_operator_events( + limit, + action=action, + status=status, + client_tag=client_tag, + update_type=update_type, + eligible=eligible, + ) + } + + @app.post("/api/route") async def preview_route(request: Request): """Dry-run one routing decision without sending a provider request.""" diff --git a/foundrygate/metrics.py b/foundrygate/metrics.py index 9a46518..63031d5 100644 --- a/foundrygate/metrics.py +++ b/foundrygate/metrics.py @@ -57,6 +57,23 @@ def calc_cost( CREATE INDEX IF NOT EXISTS idx_req_ts ON requests(timestamp); CREATE INDEX IF NOT EXISTS idx_req_provider ON requests(provider); CREATE INDEX IF NOT EXISTS idx_req_layer ON requests(layer); + +CREATE TABLE IF NOT EXISTS operator_events ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp REAL NOT NULL, + event_type TEXT NOT NULL, + action TEXT NOT NULL, + client_tag TEXT DEFAULT '', + status TEXT DEFAULT '', + update_type TEXT DEFAULT '', + target_version TEXT DEFAULT '', + eligible INTEGER DEFAULT 0, + recommended_action TEXT DEFAULT '', + detail TEXT DEFAULT '' +); +CREATE INDEX IF NOT EXISTS idx_op_ts ON operator_events(timestamp); +CREATE INDEX IF NOT EXISTS idx_op_action ON operator_events(action); +CREATE INDEX IF NOT EXISTS idx_op_status ON operator_events(status); """ _OPTIONAL_COLUMNS: dict[str, str] = { @@ -164,6 +181,45 @@ def log_request( except Exception as e: logger.warning("Metrics write failed: %s", e) + def log_operator_event( + self, + *, + event_type: str, + action: str, + client_tag: str = "", + status: str = "", + update_type: str = "", + target_version: str = "", + eligible: bool = False, + recommended_action: str = "", + detail: str = "", + ) -> None: + """Persist one operator event such as an update check or apply attempt.""" + if not self._conn: + return + try: + self._conn.execute( + """INSERT INTO operator_events + (timestamp,event_type,action,client_tag,status,update_type, + target_version,eligible,recommended_action,detail) + VALUES (?,?,?,?,?,?,?,?,?,?)""", + ( + time.time(), + event_type, + action, + client_tag, + status, + update_type, + target_version, + 1 if eligible else 0, + recommended_action, + detail, + ), + ) + self._conn.commit() + except Exception as e: + logger.warning("Operator metrics write failed: %s", e) + def get_provider_summary(self, **filters: Any) -> list[dict]: where_sql, params = self._build_where_clause(filters) return self._q( @@ -269,6 +325,31 @@ def get_daily_totals(self, days: int = 30) -> list[dict]: (cutoff,), ) + def get_operator_events(self, limit: int = 50, **filters: Any) -> list[dict]: + where_sql, params = self._build_operator_where_clause(filters) + return self._q( + f"SELECT * FROM operator_events{where_sql} ORDER BY timestamp DESC LIMIT ?", + (*params, limit), + ) + + def get_operator_breakdown(self, **filters: Any) -> list[dict]: + where_sql, params = self._build_operator_where_clause(filters) + return self._q( + f""" + SELECT event_type, + action, + client_tag, + status, + update_type, + eligible, + COUNT(*) AS events + FROM operator_events{where_sql} + GROUP BY event_type, action, client_tag, status, update_type, eligible + ORDER BY events DESC, action ASC + """, + params, + ) + def get_recent(self, limit: int = 50, **filters: Any) -> list[dict]: where_sql, params = self._build_where_clause(filters) rows = self._q( @@ -334,6 +415,33 @@ def _build_where_clause(self, filters: dict[str, Any]) -> tuple[str, tuple[Any, return "", () return f" WHERE {' AND '.join(clauses)}", tuple(params) + def _build_operator_where_clause(self, filters: dict[str, Any]) -> tuple[str, tuple[Any, ...]]: + """Build a WHERE clause for operator-event filters.""" + clauses = [] + params: list[Any] = [] + mapping = { + "event_type": "event_type", + "action": "action", + "client_tag": "client_tag", + "status": "status", + "update_type": "update_type", + } + for key, column in mapping.items(): + value = filters.get(key) + if value in (None, ""): + continue + clauses.append(f"{column} = ?") + params.append(value) + + eligible = filters.get("eligible") + if eligible not in (None, ""): + clauses.append("eligible = ?") + params.append(1 if bool(eligible) else 0) + + if not clauses: + return "", () + return f" WHERE {' AND '.join(clauses)}", tuple(params) + def _q(self, sql: str, params: tuple = ()) -> list[dict]: if not self._conn: return [] diff --git a/scripts/foundrygate-auto-update b/scripts/foundrygate-auto-update index 9720127..21106f3 100755 --- a/scripts/foundrygate-auto-update +++ b/scripts/foundrygate-auto-update @@ -3,8 +3,15 @@ set -euo pipefail api_url="${FOUNDRYGATE_UPDATE_API_URL:-http://127.0.0.1:8090/api/update?force=true}" mode="${1:-}" +operator_action="auto-update-check" +if [ "$mode" = "--apply" ]; then + operator_action="auto-update-apply" +fi -payload="$(curl -fsS "$api_url")" +payload="$(curl -fsS \ + -H 'X-FoundryGate-Client: operator' \ + -H "X-FoundryGate-Operator-Action: ${operator_action}" \ + "$api_url")" if [ "$mode" = "--json" ]; then printf '%s\n' "$payload" diff --git a/scripts/foundrygate-update-check b/scripts/foundrygate-update-check index ce90ac3..5379a16 100755 --- a/scripts/foundrygate-update-check +++ b/scripts/foundrygate-update-check @@ -1,3 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -curl -fsS "http://127.0.0.1:8090/api/update" | head -c 1600; echo +curl -fsS \ + -H 'X-FoundryGate-Client: operator' \ + -H 'X-FoundryGate-Operator-Action: update-check' \ + "http://127.0.0.1:8090/api/update" | head -c 1600 +echo diff --git a/tests/test_metrics_traces.py b/tests/test_metrics_traces.py index c2eb181..a270668 100644 --- a/tests/test_metrics_traces.py +++ b/tests/test_metrics_traces.py @@ -147,3 +147,44 @@ def test_metrics_store_filters_recent_and_breakdowns(tmp_path): assert totals["total_failures"] == 1 metrics.close() + + +def test_metrics_store_tracks_operator_events(tmp_path): + db_path = tmp_path / "operator.db" + metrics = MetricsStore(str(db_path)) + metrics.init() + + metrics.log_operator_event( + event_type="update", + action="auto-update-apply", + client_tag="operator", + status="ok", + update_type="minor", + target_version="v0.7.0", + eligible=True, + recommended_action="Upgrade to the latest release", + detail="", + ) + metrics.log_operator_event( + event_type="update", + action="update-check", + client_tag="operator", + status="unavailable", + update_type="unknown", + target_version="", + eligible=False, + recommended_action="Inspect release connectivity and retry later", + detail="network unavailable", + ) + + events = metrics.get_operator_events(10, action="auto-update-apply") + assert len(events) == 1 + assert events[0]["update_type"] == "minor" + assert events[0]["eligible"] == 1 + + breakdown = metrics.get_operator_breakdown(status="ok") + assert len(breakdown) == 1 + assert breakdown[0]["action"] == "auto-update-apply" + assert breakdown[0]["events"] == 1 + + metrics.close()