Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ The format is intentionally lightweight and human-readable. Group entries by rel

- Added stronger update-alert metadata to `GET /api/update`, including update type, alert level, and recommended action for operators and dashboard consumers
- Added an opt-in `auto_update` policy block plus `foundrygate-auto-update` so controlled deployments can gate helper-driven updates without enabling silent self-updates
- Added `GET /api/operator-events` plus operator-event metrics for update checks and helper-driven auto-update attempts

## v0.6.0 - 2026-03-12

Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ curl -fsS http://127.0.0.1:8090/v1/images/edits \
- `POST /api/route/image`
- `GET /api/providers`
- `GET /api/update`
- `GET /api/operator-events`
- `GET /api/stats`
- `GET /api/recent?limit=50`
- `GET /api/traces?limit=50`
Expand All @@ -284,6 +285,7 @@ curl -fsS http://127.0.0.1:8090/api/route/image \

curl -fsS http://127.0.0.1:8090/api/stats
curl -fsS http://127.0.0.1:8090/api/update
curl -fsS http://127.0.0.1:8090/api/operator-events
curl -fsS 'http://127.0.0.1:8090/api/providers?healthy=true'
curl -fsS 'http://127.0.0.1:8090/api/recent?limit=10'
curl -fsS 'http://127.0.0.1:8090/api/traces?limit=10'
Expand Down Expand Up @@ -531,6 +533,7 @@ update_check:
```

The status is exposed through `GET /api/update`, the dashboard, and the helper script `foundrygate-update-check`.
Recent operator-side update checks and apply attempts are exposed through `GET /api/operator-events`.

FoundryGate also supports an optional `auto_update` policy block for controlled environments. This stays strictly opt-in and only marks whether the current release state is eligible for a helper-driven update command.

Expand Down Expand Up @@ -823,6 +826,7 @@ What it does:
- exposes the cached status in `GET /api/update`
- surfaces the same status in the dashboard and `foundrygate-update-check`
- exposes opt-in auto-update eligibility and the configured apply command
- records operator-side update checks and apply attempts and exposes them through `GET /api/operator-events`

What it does not do:

Expand All @@ -834,6 +838,7 @@ Manual check:

```bash
curl -fsS http://127.0.0.1:8090/api/update
curl -fsS http://127.0.0.1:8090/api/operator-events
./scripts/foundrygate-update-check
./scripts/foundrygate-auto-update
./scripts/foundrygate-auto-update --apply
Expand Down
1 change: 1 addition & 0 deletions docs/TROUBLESHOOTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ Check the cached runtime view first:

```bash
curl -fsS http://127.0.0.1:8090/api/update
curl -fsS http://127.0.0.1:8090/api/operator-events
./scripts/foundrygate-update-check
./scripts/foundrygate-auto-update
```
Expand Down
53 changes: 52 additions & 1 deletion foundrygate/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,13 @@ def _collect_routing_headers(request: Request) -> dict[str, str]:
return {k.lower(): v for k, v in request.headers.items() if k.lower().startswith(prefixes)}


def _collect_operator_context(headers: dict[str, str]) -> tuple[str, str]:
"""Return operator action and client tag hints from request headers."""
action = headers.get("x-foundrygate-operator-action", "update-check").strip().lower()
client_tag = headers.get("x-foundrygate-client", "operator").strip().lower() or "operator"
return action, client_tag


def _match_client_profile_rule(match: dict, headers: dict[str, str]) -> bool:
"""Evaluate one client profile match block."""
if not match:
Expand Down Expand Up @@ -732,6 +739,8 @@ async def stats(
client_tag: str | None = None,
layer: str | None = None,
success: bool | None = None,
operator_action: str | None = None,
operator_status: str | None = None,
):
"""Full statistics: totals, per-provider, routing breakdown, time series."""
filters = {
Expand All @@ -742,12 +751,18 @@ async def stats(
"layer": layer,
"success": success,
}
operator_filters = {
"action": operator_action,
"status": operator_status,
"client_tag": client_tag,
}
return {
"totals": _metrics.get_totals(**filters),
"providers": _metrics.get_provider_summary(**filters),
"modalities": _metrics.get_modality_breakdown(**filters),
"routing": _metrics.get_routing_breakdown(**filters),
"clients": _metrics.get_client_breakdown(**filters),
"operator_actions": _metrics.get_operator_breakdown(**operator_filters),
"hourly": _metrics.get_hourly_series(24),
"daily": _metrics.get_daily_totals(30),
}
Expand Down Expand Up @@ -802,12 +817,48 @@ async def traces(


@app.get("/api/update")
async def update_status(force: bool = False):
async def update_status(request: Request, force: bool = False):
    """Return release update metadata and record the lookup as an operator event.

    The ``force`` flag is forwarded to the update checker. The operator action
    and client tag are derived from the request headers, and one operator event
    of type ``"update"`` is logged before the status is returned as a dict.
    """
    request_headers = _collect_routing_headers(request)
    release = await _update_checker.get_status(force=force)
    action, tag = _collect_operator_context(request_headers)
    policy = release.auto_update or {}

    # Assemble the event first so the logging call stays a one-liner.
    event = {
        "event_type": "update",
        "action": action,
        "client_tag": tag,
        "status": release.status,
        "update_type": release.update_type,
        "target_version": release.latest_version or release.current_version,
        "eligible": bool(policy.get("eligible", False)),
        "recommended_action": release.recommended_action,
        "detail": policy.get("blocked_reason", ""),
    }
    _metrics.log_operator_event(**event)
    return release.to_dict()


@app.get("/api/operator-events")
async def operator_events(
    limit: int = 50,
    action: str | None = None,
    status: str | None = None,
    client_tag: str | None = None,
    update_type: str | None = None,
    eligible: bool | None = None,
):
    """List recent operator events (update checks, apply attempts).

    Every query parameter other than ``limit`` is handed to the metrics store
    as an optional filter. The result is wrapped as ``{"events": [...]}``.
    """
    criteria = {
        "action": action,
        "status": status,
        "client_tag": client_tag,
        "update_type": update_type,
        "eligible": eligible,
    }
    matching = _metrics.get_operator_events(limit, **criteria)
    return {"events": matching}


@app.post("/api/route")
async def preview_route(request: Request):
"""Dry-run one routing decision without sending a provider request."""
Expand Down
108 changes: 108 additions & 0 deletions foundrygate/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,23 @@ def calc_cost(
CREATE INDEX IF NOT EXISTS idx_req_ts ON requests(timestamp);
CREATE INDEX IF NOT EXISTS idx_req_provider ON requests(provider);
CREATE INDEX IF NOT EXISTS idx_req_layer ON requests(layer);

CREATE TABLE IF NOT EXISTS operator_events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp REAL NOT NULL,
event_type TEXT NOT NULL,
action TEXT NOT NULL,
client_tag TEXT DEFAULT '',
status TEXT DEFAULT '',
update_type TEXT DEFAULT '',
target_version TEXT DEFAULT '',
eligible INTEGER DEFAULT 0,
recommended_action TEXT DEFAULT '',
detail TEXT DEFAULT ''
);
CREATE INDEX IF NOT EXISTS idx_op_ts ON operator_events(timestamp);
CREATE INDEX IF NOT EXISTS idx_op_action ON operator_events(action);
CREATE INDEX IF NOT EXISTS idx_op_status ON operator_events(status);
"""

_OPTIONAL_COLUMNS: dict[str, str] = {
Expand Down Expand Up @@ -164,6 +181,45 @@ def log_request(
except Exception as e:
logger.warning("Metrics write failed: %s", e)

def log_operator_event(
    self,
    *,
    event_type: str,
    action: str,
    client_tag: str = "",
    status: str = "",
    update_type: str = "",
    target_version: str = "",
    eligible: bool = False,
    recommended_action: str = "",
    detail: str = "",
) -> None:
    """Store a single operator event row (e.g. an update check or apply attempt).

    Does nothing when the store has no connection. The row is stamped with the
    current time and ``eligible`` is stored as 0/1. Any failure while writing
    is logged as a warning instead of being raised.
    """
    if not self._conn:
        return

    insert_sql = """INSERT INTO operator_events
        (timestamp,event_type,action,client_tag,status,update_type,
         target_version,eligible,recommended_action,detail)
        VALUES (?,?,?,?,?,?,?,?,?,?)"""
    try:
        # Row construction stays inside the try so that every failure on this
        # path is handled as a best-effort write.
        row = (
            time.time(),
            event_type,
            action,
            client_tag,
            status,
            update_type,
            target_version,
            int(bool(eligible)),
            recommended_action,
            detail,
        )
        self._conn.execute(insert_sql, row)
        self._conn.commit()
    except Exception as e:
        logger.warning("Operator metrics write failed: %s", e)

def get_provider_summary(self, **filters: Any) -> list[dict]:
where_sql, params = self._build_where_clause(filters)
return self._q(
Expand Down Expand Up @@ -269,6 +325,31 @@ def get_daily_totals(self, days: int = 30) -> list[dict]:
(cutoff,),
)

def get_operator_events(self, limit: int = 50, **filters: Any) -> list[dict]:
    """Return recent operator events ordered by timestamp, newest first.

    Args:
        limit: Maximum number of rows to return. Negative values are clamped
            to 0: SQLite treats a negative LIMIT as "no upper bound", which
            would otherwise return the entire table.
        **filters: Optional equality filters, translated to SQL by
            ``_build_operator_where_clause``.

    Returns:
        The rows produced by ``_q`` for the filtered query.
    """
    where_sql, params = self._build_operator_where_clause(filters)
    row_cap = max(limit, 0)
    return self._q(
        f"SELECT * FROM operator_events{where_sql} ORDER BY timestamp DESC LIMIT ?",
        (*params, row_cap),
    )

def get_operator_breakdown(self, **filters: Any) -> list[dict]:
    """Count operator events per distinct combination of their attributes.

    Events are grouped by event type, action, client tag, status, update type
    and eligibility. Groups are returned largest first, ties ordered by action.
    ``filters`` narrow the rows before grouping.
    """
    condition, bound_values = self._build_operator_where_clause(filters)
    query = f"""
        SELECT event_type,
               action,
               client_tag,
               status,
               update_type,
               eligible,
               COUNT(*) AS events
        FROM operator_events{condition}
        GROUP BY event_type, action, client_tag, status, update_type, eligible
        ORDER BY events DESC, action ASC
    """
    return self._q(query, bound_values)

def get_recent(self, limit: int = 50, **filters: Any) -> list[dict]:
where_sql, params = self._build_where_clause(filters)
rows = self._q(
Expand Down Expand Up @@ -334,6 +415,33 @@ def _build_where_clause(self, filters: dict[str, Any]) -> tuple[str, tuple[Any,
return "", ()
return f" WHERE {' AND '.join(clauses)}", tuple(params)

def _build_operator_where_clause(self, filters: dict[str, Any]) -> tuple[str, tuple[Any, ...]]:
"""Build a WHERE clause for operator-event filters."""
clauses = []
params: list[Any] = []
mapping = {
"event_type": "event_type",
"action": "action",
"client_tag": "client_tag",
"status": "status",
"update_type": "update_type",
}
for key, column in mapping.items():
value = filters.get(key)
if value in (None, ""):
continue
clauses.append(f"{column} = ?")
params.append(value)

eligible = filters.get("eligible")
if eligible not in (None, ""):
clauses.append("eligible = ?")
params.append(1 if bool(eligible) else 0)

if not clauses:
return "", ()
return f" WHERE {' AND '.join(clauses)}", tuple(params)

def _q(self, sql: str, params: tuple = ()) -> list[dict]:
if not self._conn:
return []
Expand Down
9 changes: 8 additions & 1 deletion scripts/foundrygate-auto-update
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,15 @@ set -euo pipefail

api_url="${FOUNDRYGATE_UPDATE_API_URL:-http://127.0.0.1:8090/api/update?force=true}"
mode="${1:-}"
operator_action="auto-update-check"
if [ "$mode" = "--apply" ]; then
operator_action="auto-update-apply"
fi

payload="$(curl -fsS "$api_url")"
payload="$(curl -fsS \
-H 'X-FoundryGate-Client: operator' \
-H "X-FoundryGate-Operator-Action: ${operator_action}" \
"$api_url")"

if [ "$mode" = "--json" ]; then
printf '%s\n' "$payload"
Expand Down
6 changes: 5 additions & 1 deletion scripts/foundrygate-update-check
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail
curl -fsS "http://127.0.0.1:8090/api/update" | head -c 1600; echo
# Fetch the update status, tagging the request as an operator update check
# via the X-FoundryGate-* headers.
# The response is captured before truncating: piping curl straight into
# `head -c` lets head close the pipe early on large payloads, which makes curl
# fail with a write error and, under `set -euo pipefail`, aborts the script
# before the trailing newline is printed.
payload="$(curl -fsS \
  -H 'X-FoundryGate-Client: operator' \
  -H 'X-FoundryGate-Operator-Action: update-check' \
  "http://127.0.0.1:8090/api/update")"
# Show at most the first 1600 characters, followed by a newline.
printf '%s\n' "${payload:0:1600}"
41 changes: 41 additions & 0 deletions tests/test_metrics_traces.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,44 @@ def test_metrics_store_filters_recent_and_breakdowns(tmp_path):
assert totals["total_failures"] == 1

metrics.close()


def test_metrics_store_tracks_operator_events(tmp_path):
    """Operator events are persisted and can be filtered and aggregated."""
    store = MetricsStore(str(tmp_path / "operator.db"))
    store.init()

    apply_event = {
        "event_type": "update",
        "action": "auto-update-apply",
        "client_tag": "operator",
        "status": "ok",
        "update_type": "minor",
        "target_version": "v0.7.0",
        "eligible": True,
        "recommended_action": "Upgrade to the latest release",
        "detail": "",
    }
    check_event = {
        "event_type": "update",
        "action": "update-check",
        "client_tag": "operator",
        "status": "unavailable",
        "update_type": "unknown",
        "target_version": "",
        "eligible": False,
        "recommended_action": "Inspect release connectivity and retry later",
        "detail": "network unavailable",
    }
    for event in (apply_event, check_event):
        store.log_operator_event(**event)

    # Filtering by action must return only the apply attempt.
    applied = store.get_operator_events(10, action="auto-update-apply")
    assert len(applied) == 1
    only_event = applied[0]
    assert only_event["update_type"] == "minor"
    assert only_event["eligible"] == 1

    # Aggregating by status must yield a single group for the "ok" event.
    ok_groups = store.get_operator_breakdown(status="ok")
    assert len(ok_groups) == 1
    ok_group = ok_groups[0]
    assert ok_group["action"] == "auto-update-apply"
    assert ok_group["events"] == 1

    store.close()
Loading