diff --git a/CHANGELOG.md b/CHANGELOG.md index 512eee5..403eddb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ The format is intentionally lightweight and human-readable. Group entries by rel - Added `auto_update.min_release_age_hours` so helper-driven auto-updates can wait for a release to age before becoming eligible - Added `auto_update.maintenance_window` so helper-driven auto-updates can stay inside explicit local maintenance hours - Added `auto_update.provider_scope` so rollout-health guardrails can evaluate only a selected provider subset +- Added `auto_update.verification` so helper-driven auto-updates can run a post-update check and emit a rollback hint on failure ## v0.6.0 - 2026-03-12 diff --git a/README.md b/README.md index 121544a..5a64a9c 100644 --- a/README.md +++ b/README.md @@ -550,6 +550,7 @@ Supported fields in `auto_update`: - `max_unhealthy_providers` - `min_release_age_hours` - `provider_scope` +- `verification` - `maintenance_window` - `apply_command` @@ -566,6 +567,11 @@ auto_update: provider_scope: allow_providers: ["local-worker", "deepseek-chat"] deny_providers: ["openrouter-fallback"] + verification: + enabled: true + command: "foundrygate-health" + timeout_seconds: 30 + rollback_command: "sudo systemctl restart foundrygate.service" maintenance_window: enabled: true timezone: "Europe/Berlin" @@ -582,6 +588,7 @@ What the current runtime does with it: - lets `foundrygate-auto-update --apply` run only when the current release state is eligible - can block helper-driven rollout when provider health is already degraded - can scope rollout-health checks to a specific provider subset instead of the whole runtime +- can require a post-update verification command and emit a rollback hint when that verification fails - lets operators separate `stable` vs `preview` release checks and `stable` / `early` / `canary` rollout rings - can require that a release has aged for a minimum number of hours before helper-driven rollout - can restrict helper-driven rollout to explicit local maintenance windows diff --git a/config.yaml b/config.yaml index 4104477..3caf9ea 100644 --- a/config.yaml +++ b/config.yaml @@ -896,6 +896,11 @@ auto_update: provider_scope: allow_providers: [] deny_providers: ["openrouter-fallback"] + verification: + enabled: false + command: "foundrygate-health" + timeout_seconds: 30 + rollback_command: "" maintenance_window: enabled: false timezone: "UTC" diff --git a/docs/PUBLISHING.md b/docs/PUBLISHING.md index 69e21dd..0a7bbe0 100644 --- a/docs/PUBLISHING.md +++ b/docs/PUBLISHING.md @@ -67,6 +67,7 @@ If you want scheduled update application: - keep `require_healthy_providers: true` unless you are intentionally allowing rollouts while the gateway is degraded - set `min_release_age_hours` above `0` if you want scheduled rollouts to wait before applying newly published releases - use `provider_scope.allow_providers` / `deny_providers` if rollout health should only consider a subset of providers +- enable `verification` if helper-driven updates must pass a post-update health or smoke check before the rollout is considered clean - add `maintenance_window` if scheduled updates should only run in explicit local maintenance hours - prefer the reviewed examples in [examples/foundrygate-auto-update.service](./examples/foundrygate-auto-update.service) and [examples/foundrygate-auto-update.timer](./examples/foundrygate-auto-update.timer) - use the cron example in [examples/foundrygate-auto-update.cron](./examples/foundrygate-auto-update.cron) only when `systemd` timers are not practical diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index 1daed6d..517fab1 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -184,6 +184,7 @@ If `foundrygate-auto-update --apply` refuses to run, inspect the `auto_update` b - `provider_scope.allow_providers` / `deny_providers` resolves to no matching providers - one or more providers are unhealthy while `require_healthy_providers: true` - the number of unhealthy providers exceeds `max_unhealthy_providers` +- the configured `verification.command` failed after the update command ran - the current time is outside the configured `maintenance_window.days` or `maintenance_window.start_hour` / `end_hour` - `maintenance_window.timezone` is invalid for the host runtime - the release lookup itself is unavailable diff --git a/foundrygate/config.py b/foundrygate/config.py index f6a2501..d16c307 100644 --- a/foundrygate/config.py +++ b/foundrygate/config.py @@ -956,6 +956,32 @@ def _normalize_auto_update(data: dict[str, Any]) -> dict[str, Any]: + ", ".join(overlap) ) + verification = raw.get("verification", {}) + if verification is None: + verification = {} + if not isinstance(verification, dict): + raise ConfigError("'auto_update.verification' must be a mapping") + + verification_enabled = verification.get("enabled", False) + if not isinstance(verification_enabled, bool): + raise ConfigError("'auto_update.verification.enabled' must be a boolean") + + verification_command = verification.get("command", "foundrygate-health") + if not isinstance(verification_command, str) or not verification_command.strip(): + raise ConfigError("'auto_update.verification.command' must be a non-empty string") + + verification_timeout_seconds = verification.get("timeout_seconds", 30) + if isinstance(verification_timeout_seconds, bool) or not isinstance( + verification_timeout_seconds, int + ): + raise ConfigError("'auto_update.verification.timeout_seconds' must be an integer") + if verification_timeout_seconds <= 0: + raise ConfigError("'auto_update.verification.timeout_seconds' must be positive") + + rollback_command = verification.get("rollback_command", "") + if not isinstance(rollback_command, str): + raise ConfigError("'auto_update.verification.rollback_command' must be a string") + maintenance_window = raw.get("maintenance_window", {}) if maintenance_window is None: maintenance_window = {} @@ -1011,6 +1037,12 @@ def _normalize_auto_update(data: dict[str, Any]) -> dict[str, Any]: "allow_providers": allow_providers, "deny_providers": deny_providers, }, + "verification": { + "enabled": verification_enabled, + "command": verification_command.strip(), + "timeout_seconds": verification_timeout_seconds, + "rollback_command": rollback_command.strip(), + }, "maintenance_window": { "enabled": window_enabled, "timezone": timezone.strip(), @@ -1116,6 +1148,12 @@ def auto_update(self) -> dict: "allow_providers": [], "deny_providers": [], }, + "verification": { + "enabled": False, + "command": "foundrygate-health", + "timeout_seconds": 30, + "rollback_command": "", + }, "maintenance_window": { "enabled": False, "timezone": "UTC", diff --git a/foundrygate/updates.py b/foundrygate/updates.py index 392c678..cf7f268 100644 --- a/foundrygate/updates.py +++ b/foundrygate/updates.py @@ -311,6 +311,7 @@ def __init__( "max_unhealthy_providers": int((auto_update or {}).get("max_unhealthy_providers", 0)), "min_release_age_hours": int((auto_update or {}).get("min_release_age_hours", 0)), "provider_scope": dict((auto_update or {}).get("provider_scope") or {}), + "verification": dict((auto_update or {}).get("verification") or {}), "maintenance_window": dict((auto_update or {}).get("maintenance_window") or {}), "apply_command": str((auto_update or {}).get("apply_command", "foundrygate-update")), } @@ -373,6 +374,7 @@ def _auto_update_status( "max_unhealthy_providers": int(self.auto_update.get("max_unhealthy_providers", 0)), "min_release_age_hours": int(self.auto_update.get("min_release_age_hours", 0)), "provider_scope": dict(self.auto_update.get("provider_scope") or {}), + "verification": dict(self.auto_update.get("verification") or {}), "maintenance_window": dict(self.auto_update.get("maintenance_window") or {}), "eligible": eligible, "blocked_reason": blocked_reason, diff --git a/scripts/foundrygate-auto-update b/scripts/foundrygate-auto-update index 21106f3..4328ea6 100755 --- a/scripts/foundrygate-auto-update +++ b/scripts/foundrygate-auto-update @@ -47,6 +47,10 @@ for value in ( "true" if eligible else "false", blocked, apply_command, + "true" if bool((auto.get("verification") or {}).get("enabled")) else "false", + ((auto.get("verification") or {}).get("command") or "foundrygate-health"), + str((auto.get("verification") or {}).get("timeout_seconds") or 30), + ((auto.get("verification") or {}).get("rollback_command") or ""), ): print(value) PY @@ -60,6 +64,10 @@ auto_enabled="${parsed[5]}" auto_eligible="${parsed[6]}" auto_blocked_reason="${parsed[7]}" apply_command="${parsed[8]}" +verify_enabled="${parsed[9]}" +verify_command="${parsed[10]}" +verify_timeout="${parsed[11]}" +rollback_command="${parsed[12]}" if [ "$mode" = "--apply" ]; then if [ "$auto_enabled" != "true" ]; then @@ -72,7 +80,31 @@ if [ "$mode" = "--apply" ]; then fi echo "Applying ${update_type} update to ${latest} via: ${apply_command}" - exec /bin/sh -lc "$apply_command" + /bin/sh -lc "$apply_command" + + if [ "$verify_enabled" = "true" ]; then + echo "Running post-update verification via: ${verify_command}" + export FOUNDRYGATE_VERIFY_COMMAND="$verify_command" + export FOUNDRYGATE_VERIFY_TIMEOUT="$verify_timeout" + if ! python3 - <<'PY' +import os +import subprocess +import sys + +command = os.environ["FOUNDRYGATE_VERIFY_COMMAND"] +timeout = int(os.environ["FOUNDRYGATE_VERIFY_TIMEOUT"]) +completed = subprocess.run(command, shell=True, timeout=timeout) +sys.exit(completed.returncode) +PY + then + echo "post-update verification failed" >&2 + if [ -n "$rollback_command" ]; then + echo "rollback suggested: ${rollback_command}" >&2 + fi + exit 1 + fi + fi + exit 0 fi printf 'Current: %s\nLatest: %s\nStatus: %s\nUpdate type: %s\nAction: %s\n' \ @@ -84,6 +116,12 @@ if [ "$auto_enabled" = "true" ]; then else printf 'Auto-update: blocked (%s)\n' "${auto_blocked_reason:-blocked}" fi + if [ "$verify_enabled" = "true" ]; then + printf 'Post-update verification: %s (timeout %ss)\n' "$verify_command" "$verify_timeout" + if [ -n "$rollback_command" ]; then + printf 'Rollback hint: %s\n' "$rollback_command" + fi + fi else printf 'Auto-update: disabled\n' fi diff --git a/tests/test_config.py b/tests/test_config.py index 68ef849..5382067 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -95,6 +95,12 @@ def test_auto_update_defaults_are_exposed(): "allow_providers": [], "deny_providers": ["openrouter-fallback"], } + assert cfg.auto_update["verification"] == { + "enabled": False, + "command": "foundrygate-health", + "timeout_seconds": 30, + "rollback_command": "", + } assert cfg.auto_update["maintenance_window"]["enabled"] is False assert cfg.auto_update["maintenance_window"]["timezone"] == "UTC" assert cfg.auto_update["maintenance_window"]["days"] == ["sat", "sun"] diff --git a/tests/test_updates.py b/tests/test_updates.py index 5ca2ee9..878942e 100644 --- a/tests/test_updates.py +++ b/tests/test_updates.py @@ -298,6 +298,12 @@ async def test_update_checker_reports_latest_release(): "enabled": True, "allow_major": False, "provider_scope": {"allow_providers": ["deepseek-chat"], "deny_providers": []}, + "verification": { + "enabled": True, + "command": "foundrygate-health", + "timeout_seconds": 30, + "rollback_command": "sudo systemctl restart foundrygate.service", + }, }, ) checker._client = _FakeClient( @@ -326,6 +332,12 @@ async def test_update_checker_reports_latest_release(): "allow_providers": ["deepseek-chat"], "deny_providers": [], } + assert status.auto_update["verification"] == { + "enabled": True, + "command": "foundrygate-health", + "timeout_seconds": 30, + "rollback_command": "sudo systemctl restart foundrygate.service", + } assert status.release_url.endswith("/v0.5.0")