diff --git a/CHANGELOG.md b/CHANGELOG.md index 989d825..512eee5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ The format is intentionally lightweight and human-readable. Group entries by rel - Added `update_check.release_channel` and `auto_update.rollout_ring` so operators can distinguish stable vs preview checks and tighter rollout rings - Added `auto_update.min_release_age_hours` so helper-driven auto-updates can wait for a release to age before becoming eligible - Added `auto_update.maintenance_window` so helper-driven auto-updates can stay inside explicit local maintenance hours +- Added `auto_update.provider_scope` so rollout-health guardrails can evaluate only a selected provider subset ## v0.6.0 - 2026-03-12 diff --git a/README.md b/README.md index f93b8ff..121544a 100644 --- a/README.md +++ b/README.md @@ -549,6 +549,7 @@ Supported fields in `auto_update`: - `require_healthy_providers` - `max_unhealthy_providers` - `min_release_age_hours` +- `provider_scope` - `maintenance_window` - `apply_command` @@ -562,6 +563,9 @@ auto_update: require_healthy_providers: true max_unhealthy_providers: 0 min_release_age_hours: 24 + provider_scope: + allow_providers: ["local-worker", "deepseek-chat"] + deny_providers: ["openrouter-fallback"] maintenance_window: enabled: true timezone: "Europe/Berlin" @@ -577,6 +581,7 @@ What the current runtime does with it: - shows the same state in the dashboard - lets `foundrygate-auto-update --apply` run only when the current release state is eligible - can block helper-driven rollout when provider health is already degraded +- can scope rollout-health checks to a specific provider subset instead of the whole runtime - lets operators separate `stable` vs `preview` release checks and `stable` / `early` / `canary` rollout rings - can require that a release has aged for a minimum number of hours before helper-driven rollout - can restrict helper-driven rollout to explicit local maintenance windows diff --git a/config.yaml b/config.yaml index 9441c2c..4104477 100644 --- a/config.yaml +++ b/config.yaml @@ -893,6 +893,9 @@ auto_update: require_healthy_providers: true max_unhealthy_providers: 0 min_release_age_hours: 0 + provider_scope: + allow_providers: [] + deny_providers: ["openrouter-fallback"] maintenance_window: enabled: false timezone: "UTC" diff --git a/docs/PUBLISHING.md b/docs/PUBLISHING.md index d553291..69e21dd 100644 --- a/docs/PUBLISHING.md +++ b/docs/PUBLISHING.md @@ -66,6 +66,7 @@ If you want scheduled update application: - keep `allow_major: false` unless you are ready to absorb breaking changes automatically - keep `require_healthy_providers: true` unless you are intentionally allowing rollouts while the gateway is degraded - set `min_release_age_hours` above `0` if you want scheduled rollouts to wait before applying newly published releases +- use `provider_scope.allow_providers` / `deny_providers` if rollout health should only consider a subset of providers - add `maintenance_window` if scheduled updates should only run in explicit local maintenance hours - prefer the reviewed examples in [examples/foundrygate-auto-update.service](./examples/foundrygate-auto-update.service) and [examples/foundrygate-auto-update.timer](./examples/foundrygate-auto-update.timer) - use the cron example in [examples/foundrygate-auto-update.cron](./examples/foundrygate-auto-update.cron) only when `systemd` timers are not practical diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index da3c230..1daed6d 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -181,6 +181,7 @@ If `foundrygate-auto-update --apply` refuses to run, inspect the `auto_update` b - `auto_update.enabled: false` - the latest release is a major upgrade while `allow_major: false` +- `provider_scope.allow_providers` / `deny_providers` resolves to no matching providers - one or more providers are unhealthy while `require_healthy_providers: true` - the number of unhealthy providers exceeds `max_unhealthy_providers` - the current time is outside the configured `maintenance_window.days` or `maintenance_window.start_hour` / `end_hour` diff --git a/foundrygate/config.py b/foundrygate/config.py index 81b0d99..f6a2501 100644 --- a/foundrygate/config.py +++ b/foundrygate/config.py @@ -918,6 +918,44 @@ def _normalize_auto_update(data: dict[str, Any]) -> dict[str, Any]: if min_release_age_hours < 0: raise ConfigError("'auto_update.min_release_age_hours' must be non-negative") + provider_scope = raw.get("provider_scope", {}) + if provider_scope is None: + provider_scope = {} + if not isinstance(provider_scope, dict): + raise ConfigError("'auto_update.provider_scope' must be a mapping") + + provider_names = set((data.get("providers") or {}).keys()) + allow_providers = _normalize_string_list( + provider_scope.get("allow_providers", []), + field_name="allow_providers", + rule_name="auto_update.provider_scope", + allow_empty=True, + ) + deny_providers = _normalize_string_list( + provider_scope.get("deny_providers", []), + field_name="deny_providers", + rule_name="auto_update.provider_scope", + allow_empty=True, + ) + unknown_allowed = sorted(name for name in allow_providers if name not in provider_names) + if unknown_allowed: + raise ConfigError( + "'auto_update.provider_scope.allow_providers' references unknown providers: " + + ", ".join(unknown_allowed) + ) + unknown_denied = sorted(name for name in deny_providers if name not in provider_names) + if unknown_denied: + raise ConfigError( + "'auto_update.provider_scope.deny_providers' references unknown providers: " + + ", ".join(unknown_denied) + ) + overlap = sorted(set(allow_providers) & set(deny_providers)) + if overlap: + raise ConfigError( + "'auto_update.provider_scope' cannot allow and deny the same providers: " + + ", ".join(overlap) + ) + maintenance_window = raw.get("maintenance_window", {}) if maintenance_window is None: maintenance_window = {} @@ -969,6 +1007,10 @@ def _normalize_auto_update(data: dict[str, Any]) -> dict[str, Any]: "require_healthy_providers": require_healthy_providers, "max_unhealthy_providers": max_unhealthy_providers, "min_release_age_hours": min_release_age_hours, + "provider_scope": { + "allow_providers": allow_providers, + "deny_providers": deny_providers, + }, "maintenance_window": { "enabled": window_enabled, "timezone": timezone.strip(), @@ -1070,6 +1112,10 @@ def auto_update(self) -> dict: "require_healthy_providers": True, "max_unhealthy_providers": 0, "min_release_age_hours": 0, + "provider_scope": { + "allow_providers": [], + "deny_providers": [], + }, "maintenance_window": { "enabled": False, "timezone": "UTC", diff --git a/foundrygate/main.py b/foundrygate/main.py index c3485c5..6239a60 100644 --- a/foundrygate/main.py +++ b/foundrygate/main.py @@ -263,6 +263,28 @@ def _health_summary() -> dict[str, int]: } +def _rollout_provider_summary(provider_scope: dict[str, Any] | None) -> dict[str, Any]: + """Return provider-health totals for the configured rollout scope.""" + scope = dict(provider_scope or {}) + allow = set(scope.get("allow_providers") or []) + deny = set(scope.get("deny_providers") or []) + + rows = [] + for name, provider in _providers.items(): + if allow and name not in allow: + continue + if name in deny: + continue + rows.append((name, provider)) + + return { + "providers": [name for name, _ in rows], + "providers_total": len(rows), + "providers_healthy": sum(1 for _, provider in rows if provider.health.healthy), + "providers_unhealthy": sum(1 for _, provider in rows if not provider.health.healthy), + } + + def _estimate_request_dimensions(body: dict[str, Any]) -> dict[str, int | str]: """Return lightweight request-dimension estimates for debugging and routing preview.""" messages = body.get("messages", []) @@ -831,12 +853,21 @@ async def update_status(request: Request, force: bool = False): """Return cached or fresh release update metadata.""" headers = _collect_routing_headers(request) status = await _update_checker.get_status(force=force) + rollout_summary = _rollout_provider_summary((status.auto_update or {}).get("provider_scope")) status.auto_update = apply_auto_update_guardrails( status.auto_update or {}, - providers_healthy=_health_summary()["providers_healthy"], - providers_unhealthy=_health_summary()["providers_unhealthy"], + providers_total=rollout_summary["providers_total"], + providers_healthy=rollout_summary["providers_healthy"], + providers_unhealthy=rollout_summary["providers_unhealthy"], ) status.auto_update = apply_maintenance_window_guardrail(status.auto_update or {}) + status.auto_update.setdefault("provider_scope", {}) + status.auto_update["provider_scope"]["matched_providers"] = rollout_summary["providers"] + status.auto_update["provider_scope"]["summary"] = { + "providers_total": rollout_summary["providers_total"], + "providers_healthy": rollout_summary["providers_healthy"], + "providers_unhealthy": rollout_summary["providers_unhealthy"], + } operator_action, client_tag = _collect_operator_context(headers) auto_update = status.auto_update or {} _metrics.log_operator_event( diff --git a/foundrygate/updates.py b/foundrygate/updates.py index 1a51482..392c678 100644 --- a/foundrygate/updates.py +++ b/foundrygate/updates.py @@ -119,6 +119,7 @@ def release_age_hours(published_at: str, *, now: datetime | None = None) -> floa def apply_auto_update_guardrails( auto_update: dict[str, Any], *, + providers_total: int, providers_healthy: int, providers_unhealthy: int, ) -> dict[str, Any]: @@ -133,6 +134,11 @@ def apply_auto_update_guardrails( if not require_healthy_providers: return result + if providers_total <= 0: + result["eligible"] = False + result["blocked_reason"] = "No providers match rollout provider scope" + return result + if providers_healthy <= 0: result["eligible"] = False result["blocked_reason"] = "No healthy providers available" @@ -304,6 +310,7 @@ def __init__( ), "max_unhealthy_providers": int((auto_update or {}).get("max_unhealthy_providers", 0)), "min_release_age_hours": int((auto_update or {}).get("min_release_age_hours", 0)), + "provider_scope": dict((auto_update or {}).get("provider_scope") or {}), "maintenance_window": dict((auto_update or {}).get("maintenance_window") or {}), "apply_command": str((auto_update or {}).get("apply_command", "foundrygate-update")), } @@ -365,6 +372,7 @@ def _auto_update_status( ), "max_unhealthy_providers": int(self.auto_update.get("max_unhealthy_providers", 0)), "min_release_age_hours": int(self.auto_update.get("min_release_age_hours", 0)), + "provider_scope": dict(self.auto_update.get("provider_scope") or {}), "maintenance_window": dict(self.auto_update.get("maintenance_window") or {}), "eligible": eligible, "blocked_reason": blocked_reason, diff --git a/tests/test_config.py b/tests/test_config.py index 8c1ce91..68ef849 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -91,6 +91,10 @@ def test_auto_update_defaults_are_exposed(): assert cfg.auto_update["require_healthy_providers"] is True assert cfg.auto_update["max_unhealthy_providers"] == 0 assert cfg.auto_update["min_release_age_hours"] == 0 + assert cfg.auto_update["provider_scope"] == { + "allow_providers": [], + "deny_providers": ["openrouter-fallback"], + } assert cfg.auto_update["maintenance_window"]["enabled"] is False assert cfg.auto_update["maintenance_window"]["timezone"] == "UTC" assert cfg.auto_update["maintenance_window"]["days"] == ["sat", "sun"] diff --git a/tests/test_updates.py b/tests/test_updates.py index 231ef22..5ca2ee9 100644 --- a/tests/test_updates.py +++ b/tests/test_updates.py @@ -116,6 +116,7 @@ def test_auto_update_guardrails_block_when_too_many_providers_are_unhealthy(): "max_unhealthy_providers": 0, "blocked_reason": "", }, + providers_total=2, providers_healthy=1, providers_unhealthy=1, ) @@ -133,6 +134,7 @@ def test_auto_update_guardrails_allow_updates_when_health_budget_is_met(): "max_unhealthy_providers": 1, "blocked_reason": "", }, + providers_total=3, providers_healthy=2, providers_unhealthy=1, ) @@ -149,6 +151,7 @@ def test_auto_update_guardrails_block_when_no_provider_is_healthy(): "max_unhealthy_providers": 2, "blocked_reason": "", }, + providers_total=2, providers_healthy=0, providers_unhealthy=2, ) @@ -157,6 +160,24 @@ def test_auto_update_guardrails_block_when_no_provider_is_healthy(): assert guarded["blocked_reason"] == "No healthy providers available" +def test_auto_update_guardrails_block_when_provider_scope_matches_nothing(): + guarded = apply_auto_update_guardrails( + { + "enabled": True, + "eligible": True, + "require_healthy_providers": True, + "max_unhealthy_providers": 0, + "blocked_reason": "", + }, + providers_total=0, + providers_healthy=0, + providers_unhealthy=0, + ) + + assert guarded["eligible"] is False + assert guarded["blocked_reason"] == "No providers match rollout provider scope" + + def test_maintenance_window_guardrail_allows_updates_when_window_is_disabled(): guarded = apply_maintenance_window_guardrail( { @@ -273,7 +294,11 @@ async def test_update_checker_reports_latest_release(): current_version="0.4.0", enabled=True, repository="typelicious/FoundryGate", - auto_update={"enabled": True, "allow_major": False}, + auto_update={ + "enabled": True, + "allow_major": False, + "provider_scope": {"allow_providers": ["deepseek-chat"], "deny_providers": []}, + }, ) checker._client = _FakeClient( _FakeResponse( @@ -297,6 +322,10 @@ async def test_update_checker_reports_latest_release(): assert status.auto_update["eligible"] is True assert status.release_channel == "stable" assert status.auto_update["allowed_update_types"] == ["patch", "minor"] + assert status.auto_update["provider_scope"] == { + "allow_providers": ["deepseek-chat"], + "deny_providers": [], + } assert status.release_url.endswith("/v0.5.0")