From 0abc3666982fd74253d4421ff6ffcb848cfbd52a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Lange?= Date: Thu, 12 Mar 2026 19:22:27 +0100 Subject: [PATCH 1/2] feat(ops): add post-update verification gate --- CHANGELOG.md | 1 + README.md | 7 ++++++ config.yaml | 5 +++++ docs/PUBLISHING.md | 1 + docs/TROUBLESHOOTING.md | 1 + foundrygate/config.py | 38 +++++++++++++++++++++++++++++++ foundrygate/updates.py | 2 ++ scripts/foundrygate-auto-update | 40 ++++++++++++++++++++++++++++++++- tests/test_config.py | 6 +++++ tests/test_updates.py | 12 ++++++++++ 10 files changed, 112 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 512eee5..403eddb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ The format is intentionally lightweight and human-readable. Group entries by rel - Added `auto_update.min_release_age_hours` so helper-driven auto-updates can wait for a release to age before becoming eligible - Added `auto_update.maintenance_window` so helper-driven auto-updates can stay inside explicit local maintenance hours - Added `auto_update.provider_scope` so rollout-health guardrails can evaluate only a selected provider subset +- Added `auto_update.verification` so helper-driven auto-updates can run a post-update check and emit a rollback hint on failure ## v0.6.0 - 2026-03-12 diff --git a/README.md b/README.md index 121544a..5a64a9c 100644 --- a/README.md +++ b/README.md @@ -550,6 +550,7 @@ Supported fields in `auto_update`: - `max_unhealthy_providers` - `min_release_age_hours` - `provider_scope` +- `verification` - `maintenance_window` - `apply_command` @@ -566,6 +567,11 @@ auto_update: provider_scope: allow_providers: ["local-worker", "deepseek-chat"] deny_providers: ["openrouter-fallback"] + verification: + enabled: true + command: "foundrygate-health" + timeout_seconds: 30 + rollback_command: "sudo systemctl restart foundrygate.service" maintenance_window: enabled: true timezone: "Europe/Berlin" @@ -582,6 +588,7 @@ What the current runtime does with it: - lets `foundrygate-auto-update --apply` run only when the current release state is eligible - can block helper-driven rollout when provider health is already degraded - can scope rollout-health checks to a specific provider subset instead of the whole runtime +- can require a post-update verification command and emit a rollback hint when that verification fails - lets operators separate `stable` vs `preview` release checks and `stable` / `early` / `canary` rollout rings - can require that a release has aged for a minimum number of hours before helper-driven rollout - can restrict helper-driven rollout to explicit local maintenance windows diff --git a/config.yaml b/config.yaml index 4104477..3caf9ea 100644 --- a/config.yaml +++ b/config.yaml @@ -896,6 +896,11 @@ auto_update: provider_scope: allow_providers: [] deny_providers: ["openrouter-fallback"] + verification: + enabled: false + command: "foundrygate-health" + timeout_seconds: 30 + rollback_command: "" maintenance_window: enabled: false timezone: "UTC" diff --git a/docs/PUBLISHING.md b/docs/PUBLISHING.md index 69e21dd..0a7bbe0 100644 --- a/docs/PUBLISHING.md +++ b/docs/PUBLISHING.md @@ -67,6 +67,7 @@ If you want scheduled update application: - keep `require_healthy_providers: true` unless you are intentionally allowing rollouts while the gateway is degraded - set `min_release_age_hours` above `0` if you want scheduled rollouts to wait before applying newly published releases - use `provider_scope.allow_providers` / `deny_providers` if rollout health should only consider a subset of providers +- enable `verification` if helper-driven updates must pass a post-update health or smoke check before the rollout is considered clean - add `maintenance_window` if scheduled updates should only run in explicit local maintenance hours - prefer the reviewed examples in [examples/foundrygate-auto-update.service](./examples/foundrygate-auto-update.service) and [examples/foundrygate-auto-update.timer](./examples/foundrygate-auto-update.timer) - use the cron example in [examples/foundrygate-auto-update.cron](./examples/foundrygate-auto-update.cron) only when `systemd` timers are not practical diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index 1daed6d..517fab1 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -184,6 +184,7 @@ If `foundrygate-auto-update --apply` refuses to run, inspect the `auto_update` b - `provider_scope.allow_providers` / `deny_providers` resolves to no matching providers - one or more providers are unhealthy while `require_healthy_providers: true` - the number of unhealthy providers exceeds `max_unhealthy_providers` +- the configured `verification.command` failed after the update command ran - the current time is outside the configured `maintenance_window.days` or `maintenance_window.start_hour` / `end_hour` - `maintenance_window.timezone` is invalid for the host runtime - the release lookup itself is unavailable diff --git a/foundrygate/config.py b/foundrygate/config.py index f6a2501..d16c307 100644 --- a/foundrygate/config.py +++ b/foundrygate/config.py @@ -956,6 +956,32 @@ def _normalize_auto_update(data: dict[str, Any]) -> dict[str, Any]: + ", ".join(overlap) ) + verification = raw.get("verification", {}) + if verification is None: + verification = {} + if not isinstance(verification, dict): + raise ConfigError("'auto_update.verification' must be a mapping") + + verification_enabled = verification.get("enabled", False) + if not isinstance(verification_enabled, bool): + raise ConfigError("'auto_update.verification.enabled' must be a boolean") + + verification_command = verification.get("command", "foundrygate-health") + if not isinstance(verification_command, str) or not verification_command.strip(): + raise ConfigError("'auto_update.verification.command' must be a non-empty string") + + verification_timeout_seconds = verification.get("timeout_seconds", 30) + if isinstance(verification_timeout_seconds, bool) or not isinstance( + verification_timeout_seconds, int + ): + raise ConfigError("'auto_update.verification.timeout_seconds' must be an integer") + if verification_timeout_seconds <= 0: + raise ConfigError("'auto_update.verification.timeout_seconds' must be positive") + + rollback_command = verification.get("rollback_command", "") + if not isinstance(rollback_command, str): + raise ConfigError("'auto_update.verification.rollback_command' must be a string") + maintenance_window = raw.get("maintenance_window", {}) if maintenance_window is None: maintenance_window = {} @@ -1011,6 +1037,12 @@ def _normalize_auto_update(data: dict[str, Any]) -> dict[str, Any]: "allow_providers": allow_providers, "deny_providers": deny_providers, }, + "verification": { + "enabled": verification_enabled, + "command": verification_command.strip(), + "timeout_seconds": verification_timeout_seconds, + "rollback_command": rollback_command.strip(), + }, "maintenance_window": { "enabled": window_enabled, "timezone": timezone.strip(), @@ -1116,6 +1148,12 @@ def auto_update(self) -> dict: "allow_providers": [], "deny_providers": [], }, + "verification": { + "enabled": False, + "command": "foundrygate-health", + "timeout_seconds": 30, + "rollback_command": "", + }, "maintenance_window": { "enabled": False, "timezone": "UTC", diff --git a/foundrygate/updates.py b/foundrygate/updates.py index 81e44e1..4883513 100644 --- a/foundrygate/updates.py +++ b/foundrygate/updates.py @@ -311,6 +311,7 @@ def __init__( "max_unhealthy_providers": int((auto_update or {}).get("max_unhealthy_providers", 0)), "min_release_age_hours": int((auto_update or {}).get("min_release_age_hours", 0)), "provider_scope": dict((auto_update or {}).get("provider_scope") or {}), + "verification": dict((auto_update or {}).get("verification") or {}), "maintenance_window": dict((auto_update or {}).get("maintenance_window") or {}), "apply_command": str((auto_update or {}).get("apply_command", "foundrygate-update")), } @@ -373,6 +374,7 @@ def _auto_update_status( "max_unhealthy_providers": int(self.auto_update.get("max_unhealthy_providers", 0)), "min_release_age_hours": int(self.auto_update.get("min_release_age_hours", 0)), "provider_scope": dict(self.auto_update.get("provider_scope") or {}), + "verification": dict(self.auto_update.get("verification") or {}), "maintenance_window": dict(self.auto_update.get("maintenance_window") or {}), "eligible": eligible, "blocked_reason": blocked_reason, diff --git a/scripts/foundrygate-auto-update b/scripts/foundrygate-auto-update index 21106f3..4328ea6 100755 --- a/scripts/foundrygate-auto-update +++ b/scripts/foundrygate-auto-update @@ -47,6 +47,10 @@ for value in ( "true" if eligible else "false", blocked, apply_command, + "true" if bool((auto.get("verification") or {}).get("enabled")) else "false", + ((auto.get("verification") or {}).get("command") or "foundrygate-health"), + str((auto.get("verification") or {}).get("timeout_seconds") or 30), + ((auto.get("verification") or {}).get("rollback_command") or ""), ): print(value) PY @@ -60,6 +64,10 @@ auto_enabled="${parsed[5]}" auto_eligible="${parsed[6]}" auto_blocked_reason="${parsed[7]}" apply_command="${parsed[8]}" +verify_enabled="${parsed[9]}" +verify_command="${parsed[10]}" +verify_timeout="${parsed[11]}" +rollback_command="${parsed[12]}" if [ "$mode" = "--apply" ]; then if [ "$auto_enabled" != "true" ]; then @@ -72,7 +80,31 @@ if [ "$mode" = "--apply" ]; then fi echo "Applying ${update_type} update to ${latest} via: ${apply_command}" - exec /bin/sh -lc "$apply_command" + /bin/sh -lc "$apply_command" + + if [ "$verify_enabled" = "true" ]; then + echo "Running post-update verification via: ${verify_command}" + export FOUNDRYGATE_VERIFY_COMMAND="$verify_command" + export FOUNDRYGATE_VERIFY_TIMEOUT="$verify_timeout" + if ! python3 - <<'PY' +import os +import subprocess +import sys + +command = os.environ["FOUNDRYGATE_VERIFY_COMMAND"] +timeout = int(os.environ["FOUNDRYGATE_VERIFY_TIMEOUT"]) +completed = subprocess.run(command, shell=True, timeout=timeout) +sys.exit(completed.returncode) +PY + then + echo "post-update verification failed" >&2 + if [ -n "$rollback_command" ]; then + echo "rollback suggested: ${rollback_command}" >&2 + fi + exit 1 + fi + fi + exit 0 fi printf 'Current: %s\nLatest: %s\nStatus: %s\nUpdate type: %s\nAction: %s\n' \ @@ -84,6 +116,12 @@ if [ "$auto_enabled" = "true" ]; then else printf 'Auto-update: blocked (%s)\n' "${auto_blocked_reason:-blocked}" fi + if [ "$verify_enabled" = "true" ]; then + printf 'Post-update verification: %s (timeout %ss)\n' "$verify_command" "$verify_timeout" + if [ -n "$rollback_command" ]; then + printf 'Rollback hint: %s\n' "$rollback_command" + fi + fi else printf 'Auto-update: disabled\n' fi diff --git a/tests/test_config.py b/tests/test_config.py index 68ef849..5382067 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -95,6 +95,12 @@ def test_auto_update_defaults_are_exposed(): "allow_providers": [], "deny_providers": ["openrouter-fallback"], } + assert cfg.auto_update["verification"] == { + "enabled": False, + "command": "foundrygate-health", + "timeout_seconds": 30, + "rollback_command": "", + } assert cfg.auto_update["maintenance_window"]["enabled"] is False assert cfg.auto_update["maintenance_window"]["timezone"] == "UTC" assert cfg.auto_update["maintenance_window"]["days"] == ["sat", "sun"] diff --git a/tests/test_updates.py b/tests/test_updates.py index 586ad12..06802b7 100644 --- a/tests/test_updates.py +++ b/tests/test_updates.py @@ -296,6 +296,12 @@ async def test_update_checker_reports_latest_release(): "enabled": True, "allow_major": False, "provider_scope": {"allow_providers": ["deepseek-chat"], "deny_providers": []}, + "verification": { + "enabled": True, + "command": "foundrygate-health", + "timeout_seconds": 30, + "rollback_command": "sudo systemctl restart foundrygate.service", + }, }, ) checker._client = _FakeClient( @@ -324,6 +330,12 @@ async def test_update_checker_reports_latest_release(): "allow_providers": ["deepseek-chat"], "deny_providers": [], } + assert status.auto_update["verification"] == { + "enabled": True, + "command": "foundrygate-health", + "timeout_seconds": 30, + "rollback_command": "sudo systemctl restart foundrygate.service", + } assert status.release_url.endswith("/v0.5.0") From be0f19cf059f22cfc5c0a624bfe958bb0400feac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=CC=81=20Lange?= Date: Thu, 12 Mar 2026 19:51:20 +0100 Subject: [PATCH 2/2] fix(ci): restore python 3.10 compatibility --- foundrygate/updates.py | 6 +++--- tests/test_updates.py | 20 +++++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/foundrygate/updates.py b/foundrygate/updates.py index 4883513..cf7f268 100644 --- a/foundrygate/updates.py +++ b/foundrygate/updates.py @@ -4,7 +4,7 @@ import time from dataclasses import dataclass -from datetime import UTC, datetime +from datetime import datetime, timezone from typing import Any from zoneinfo import ZoneInfo, ZoneInfoNotFoundError @@ -112,7 +112,7 @@ def release_age_hours(published_at: str, *, now: datetime | None = None) -> floa published = datetime.fromisoformat(published_at.replace("Z", "+00:00")) except ValueError: return None - current = now or datetime.now(UTC) + current = now or datetime.now(timezone.utc) return max(0.0, (current - published).total_seconds() / 3600) @@ -209,7 +209,7 @@ def apply_maintenance_window_guardrail( result["maintenance_window"] = window return result - current = (now or datetime.now(UTC)).astimezone(zone) + current = (now or datetime.now(timezone.utc)).astimezone(zone) day_name = current.strftime("%a").lower()[:3] allowed_days = list(window.get("days") or []) start_hour = int(window.get("start_hour", 0)) diff --git a/tests/test_updates.py b/tests/test_updates.py index 06802b7..878942e 100644 --- a/tests/test_updates.py +++ b/tests/test_updates.py @@ -2,7 +2,7 @@ from __future__ import annotations -from datetime import UTC, datetime, timedelta +from datetime import datetime, timedelta, timezone import pytest @@ -86,7 +86,7 @@ def test_select_release_payload_uses_first_preview_release(): def test_release_age_hours_reports_elapsed_time(): - now = datetime(2026, 3, 12, 18, 0, tzinfo=UTC) + now = datetime(2026, 3, 12, 18, 0, tzinfo=timezone.utc) published = (now - timedelta(hours=6)).isoformat().replace("+00:00", "Z") assert release_age_hours(published, now=now) == 6.0 @@ -99,7 +99,9 @@ def test_release_age_guardrail_blocks_new_releases(): "min_release_age_hours": 24, "blocked_reason": "", }, - published_at=(datetime.now(UTC) - timedelta(hours=2)).isoformat().replace("+00:00", "Z"), + published_at=(datetime.now(timezone.utc) - timedelta(hours=2)) + .isoformat() + .replace("+00:00", "Z"), ) assert guarded["eligible"] is False assert guarded["blocked_reason"].startswith("Release is too new") @@ -190,7 +192,7 @@ def test_maintenance_window_guardrail_allows_updates_when_window_is_disabled(): "end_hour": 24, }, }, - now=datetime(2026, 3, 12, 12, 0, tzinfo=UTC), + now=datetime(2026, 3, 12, 12, 0, tzinfo=timezone.utc), ) assert guarded["eligible"] is True @@ -211,7 +213,7 @@ def test_maintenance_window_guardrail_blocks_outside_allowed_days(): "end_hour": 24, }, }, - now=datetime(2026, 3, 12, 12, 0, tzinfo=UTC), + now=datetime(2026, 3, 12, 12, 0, tzinfo=timezone.utc), ) assert guarded["eligible"] is False @@ -233,7 +235,7 @@ def test_maintenance_window_guardrail_blocks_outside_allowed_hours(): "end_hour": 5, }, }, - now=datetime(2026, 3, 12, 12, 0, tzinfo=UTC), + now=datetime(2026, 3, 12, 12, 0, tzinfo=timezone.utc), ) assert guarded["eligible"] is False @@ -255,7 +257,7 @@ def test_maintenance_window_guardrail_allows_inside_matching_window(): "end_hour": 14, }, }, - now=datetime(2026, 3, 12, 12, 0, tzinfo=UTC), + now=datetime(2026, 3, 12, 12, 0, tzinfo=timezone.utc), ) assert guarded["eligible"] is True @@ -278,7 +280,7 @@ def test_maintenance_window_guardrail_blocks_unknown_timezone(): "end_hour": 24, }, }, - now=datetime(2026, 3, 12, 12, 0, tzinfo=UTC), + now=datetime(2026, 3, 12, 12, 0, tzinfo=timezone.utc), ) assert guarded["eligible"] is False @@ -491,7 +493,7 @@ async def test_min_release_age_blocks_auto_update_until_release_has_aged(): { "tag_name": "v0.6.1", "html_url": "https://github.com/typelicious/FoundryGate/releases/tag/v0.6.1", - "published_at": (datetime.now(UTC) - timedelta(hours=1)) + "published_at": (datetime.now(timezone.utc) - timedelta(hours=1)) .isoformat() .replace("+00:00", "Z"), },