From 7f2615e03dbcf8a4793f2a1c860a5f17237d6b9d Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 01:27:45 +0000 Subject: [PATCH 1/8] wip --- .../deployment/strategy/rolling_update.py | 146 ++- .../manager/sokovan/deployment/strategy/BUILD | 3 + .../sokovan/deployment/strategy/__init__.py | 0 .../strategy/test_rolling_update.py | 1128 +++++++++++++++++ 4 files changed, 1273 insertions(+), 4 deletions(-) create mode 100644 tests/unit/manager/sokovan/deployment/strategy/BUILD create mode 100644 tests/unit/manager/sokovan/deployment/strategy/__init__.py create mode 100644 tests/unit/manager/sokovan/deployment/strategy/test_rolling_update.py diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py b/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py index fbcb764355c..d64ea24e980 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py @@ -1,4 +1,4 @@ -"""Rolling update strategy evaluation for a single deployment cycle (BEP-1049). +"""Rolling update FSM evaluation for a single deployment cycle (BEP-1049). Classifies routes by revision (old/new) and status, then decides the next sub-step and route mutations based on ``max_surge`` / ``max_unavailable``. 
from __future__ import annotations

import logging
from collections.abc import Sequence

from ai.backend.logging import BraceStyleAdapter
from ai.backend.manager.data.deployment.types import (
    DeploymentInfo,
    DeploymentSubStep,
    RouteInfo,
    RouteStatus,
)
from ai.backend.manager.models.deployment_policy import RollingUpdateSpec
from ai.backend.manager.models.routing import RoutingRow
from ai.backend.manager.repositories.base import Creator
from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec

from .types import CycleEvaluationResult, RouteChanges

log = BraceStyleAdapter(logging.getLogger(__name__))


def rolling_update_evaluate(
    deployment: DeploymentInfo,
    routes: Sequence[RouteInfo],
    spec: RollingUpdateSpec,
) -> CycleEvaluationResult:
    """Evaluate one cycle of rolling update for a single deployment.

    FSM flow:
    1. Classify routes into old / new by revision_id.
    2. If any new route is PROVISIONING → PROVISIONING (wait).
    3. If no old routes remain and new_healthy >= desired → completed.
    4. If all new routes failed → ROLLED_BACK.
    5. Compute allowed surge/unavailable, decide create/terminate → PROGRESSING.

    Args:
        deployment: The deployment being updated. Its ``deploying_revision_id``
            decides which routes are "new" and its replica spec gives the
            desired replica count.
        routes: All routes currently known for the deployment, in any status.
        spec: Rolling update budget (``max_surge`` / ``max_unavailable``).

    Returns:
        The sub-step for this cycle. ``completed=True`` is set only in step 3;
        ``route_changes`` is populated only in step 5 and may be empty when the
        budget allows neither a creation nor a termination (a warning is logged
        in that case because the update cannot advance on its own).
    """
    deploying_rev = deployment.deploying_revision_id
    desired = deployment.replica_spec.target_replica_count

    # ── 1. Classify routes ──
    old_active: list[RouteInfo] = []
    new_provisioning: list[RouteInfo] = []
    new_healthy: list[RouteInfo] = []
    new_failed: list[RouteInfo] = []

    for r in routes:
        # Without a deploying revision nothing can be "new". A bare equality
        # check would match any route whose revision_id is also None
        # (NOTE(review): presumably possible for legacy routes — confirm).
        is_new = deploying_rev is not None and r.revision_id == deploying_rev
        if not is_new:
            # Inactive old routes are already on their way out; ignore them.
            if r.status.is_active():
                old_active.append(r)
            continue

        if r.status == RouteStatus.PROVISIONING:
            new_provisioning.append(r)
        elif r.status == RouteStatus.HEALTHY:
            new_healthy.append(r)
        elif r.status in (RouteStatus.FAILED_TO_START, RouteStatus.TERMINATED):
            new_failed.append(r)
        elif r.status.is_active():
            # Any other active status of a new route is counted together with
            # the healthy ones for budgeting and completion purposes.
            new_healthy.append(r)
        # New routes in any remaining status are neither live nor failed.

    total_new_live = len(new_provisioning) + len(new_healthy)

    # ── 2. PROVISIONING: wait for in-flight routes ──
    if new_provisioning:
        log.debug(
            "deployment {}: {} new routes still provisioning",
            deployment.id,
            len(new_provisioning),
        )
        return CycleEvaluationResult(sub_step=DeploymentSubStep.PROVISIONING)

    # ── 3. Completed: all old replaced, enough new healthy ──
    if not old_active and len(new_healthy) >= desired:
        log.info(
            "deployment {}: rolling update complete ({} healthy routes)",
            deployment.id,
            len(new_healthy),
        )
        return CycleEvaluationResult(
            sub_step=DeploymentSubStep.PROGRESSING,
            completed=True,
        )

    # ── 4. Rolled back: every new route failed ──
    if total_new_live == 0 and new_failed:
        log.warning(
            "deployment {}: all {} new routes failed — rolling back",
            deployment.id,
            len(new_failed),
        )
        return CycleEvaluationResult(sub_step=DeploymentSubStep.ROLLED_BACK)

    # ── 5. PROGRESSING: compute surge / unavailable budget ──
    max_surge = spec.max_surge
    max_unavailable = spec.max_unavailable

    # Total routes allowed at peak = desired + max_surge
    max_total = desired + max_surge
    current_total = len(old_active) + total_new_live

    # Minimum available routes = desired - max_unavailable (never negative)
    min_available = max(0, desired - max_unavailable)

    route_changes = RouteChanges()

    # Decide how many new routes to create: bounded by the surge headroom and
    # by how many new routes are still missing.
    can_create = max_total - current_total
    still_needed = desired - total_new_live
    to_create = max(0, min(can_create, still_needed))

    if to_create > 0:
        route_changes.scale_out_specs = _build_route_creators(deployment, to_create)

    # Decide how many old routes to terminate: only what exceeds the
    # availability floor, and never more than the old routes that exist.
    available_count = len(new_healthy) + len(old_active)
    can_terminate = available_count - min_available
    to_terminate = max(0, min(can_terminate, len(old_active)))

    if to_terminate > 0:
        # Terminate old routes with lowest termination priority first
        sorted_old = sorted(old_active, key=lambda r: r.status.termination_priority())
        route_changes.scale_in_route_ids.extend(r.route_id for r in sorted_old[:to_terminate])

    if to_create == 0 and to_terminate == 0:
        # Nothing is provisioning (step 2) and no change is allowed, so the next
        # cycle will see the same state: the budget (typically max_surge=0 with
        # max_unavailable=0) leaves no room to move. Surface it instead of
        # silently reporting PROGRESSING forever.
        log.warning(
            "deployment {}: rolling update cannot make progress "
            "(desired={}, max_surge={}, max_unavailable={}, old_active={}, new_live={})",
            deployment.id,
            desired,
            max_surge,
            max_unavailable,
            len(old_active),
            total_new_live,
        )

    log.debug(
        "deployment {}: PROGRESSING create={}, terminate={}, "
        "old_active={}, new_healthy={}, new_prov={}",
        deployment.id,
        to_create,
        to_terminate,
        len(old_active),
        len(new_healthy),
        len(new_provisioning),
    )

    return CycleEvaluationResult(
        sub_step=DeploymentSubStep.PROGRESSING,
        route_changes=route_changes,
    )


def _build_route_creators(
    deployment: DeploymentInfo,
    count: int,
) -> list[Creator[RoutingRow]]:
    """Build ``count`` route creators targeting the deploying revision.

    Every creator gets its own ``RouteCreatorSpec`` instance; all of them copy
    the endpoint id, session owner, domain and project from ``deployment``.
    """
    return [
        Creator(
            spec=RouteCreatorSpec(
                endpoint_id=deployment.id,
                session_owner_id=deployment.metadata.session_owner,
                domain=deployment.metadata.domain,
                project_id=deployment.metadata.project,
                revision_id=deployment.deploying_revision_id,
            )
        )
        for _ in range(count)
    ]
ENDPOINT_ID = UUID("aaaaaaaa-0000-0000-0000-aaaaaaaaaaaa")
OLD_REV = UUID("11111111-1111-1111-1111-111111111111")
NEW_REV = UUID("22222222-2222-2222-2222-222222222222")
PROJECT_ID = UUID("cccccccc-cccc-cccc-cccc-cccccccccccc")
USER_ID = UUID("dddddddd-dddd-dddd-dddd-dddddddddddd")


def make_deployment(
    *,
    desired: int = 1,
    deploying_revision_id: UUID = NEW_REV,
    current_revision_id: UUID = OLD_REV,
    endpoint_id: UUID = ENDPOINT_ID,
) -> DeploymentInfo:
    """Return a DEPLOYING ``DeploymentInfo`` fixture with ``desired`` replicas."""
    metadata = DeploymentMetadata(
        name="test-deploy",
        domain="default",
        project=PROJECT_ID,
        resource_group="default",
        created_user=USER_ID,
        session_owner=USER_ID,
        created_at=datetime.now(UTC),
        revision_history_limit=5,
    )
    state = DeploymentState(
        lifecycle=EndpointLifecycle.DEPLOYING,
        retry_count=0,
    )
    return DeploymentInfo(
        id=endpoint_id,
        metadata=metadata,
        state=state,
        replica_spec=ReplicaSpec(replica_count=desired),
        network=DeploymentNetworkSpec(open_to_public=False),
        model_revisions=[],
        current_revision_id=current_revision_id,
        deploying_revision_id=deploying_revision_id,
    )


def make_route(
    *,
    revision_id: UUID,
    status: RouteStatus = RouteStatus.HEALTHY,
    endpoint_id: UUID = ENDPOINT_ID,
    route_id: UUID | None = None,
) -> RouteInfo:
    """Return a ``RouteInfo`` fixture; traffic fields follow ``status.is_active()``."""
    serving = status.is_active()
    return RouteInfo(
        route_id=route_id or uuid4(),
        endpoint_id=endpoint_id,
        session_id=SessionId(uuid4()),
        status=status,
        traffic_ratio=1.0 if serving else 0.0,
        created_at=datetime.now(UTC),
        revision_id=revision_id,
        traffic_status=RouteTrafficStatus.ACTIVE if serving else RouteTrafficStatus.INACTIVE,
    )


# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------


def _count_scale_out(result: CycleEvaluationResult) -> int:
    """Number of route creators requested by ``result``."""
    return len(result.route_changes.scale_out_specs)


def _scale_in_ids(result: CycleEvaluationResult) -> list[UUID]:
    """Route ids that ``result`` asks to terminate, in the order they were chosen."""
    return result.route_changes.scale_in_route_ids


# ===========================================================================
# 1. Basic FSM states
# ===========================================================================


class TestBasicFSMStates:
    """Elementary outcomes of a single evaluation cycle."""

    def test_no_routes_initial_cycle_creates_new(self) -> None:
        """No routes at all → PROGRESSING and the desired count is created."""
        outcome = rolling_update_evaluate(
            make_deployment(desired=1),
            [],
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert outcome.sub_step == DeploymentSubStep.PROGRESSING
        assert not outcome.completed
        assert _count_scale_out(outcome) == 1
        assert len(_scale_in_ids(outcome)) == 0

    def test_new_provisioning_waits(self) -> None:
        """A new route still PROVISIONING → PROVISIONING sub-step, no changes."""
        fleet = [
            make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY),
            make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING),
        ]

        outcome = rolling_update_evaluate(
            make_deployment(desired=1),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert outcome.sub_step == DeploymentSubStep.PROVISIONING
        assert not outcome.completed
        assert _count_scale_out(outcome) == 0
        assert len(_scale_in_ids(outcome)) == 0

    def test_completed_when_all_new_healthy_and_no_old(self) -> None:
        """No old routes and new_healthy >= desired → completed."""
        fleet = [make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY) for _ in range(2)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=2),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert outcome.completed
        assert outcome.sub_step == DeploymentSubStep.PROGRESSING

    def test_rollback_when_all_new_failed(self) -> None:
        """The only new route FAILED_TO_START → ROLLED_BACK."""
        fleet = [
            make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY),
            make_route(revision_id=NEW_REV, status=RouteStatus.FAILED_TO_START),
        ]

        outcome = rolling_update_evaluate(
            make_deployment(desired=1),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert outcome.sub_step == DeploymentSubStep.ROLLED_BACK
        assert not outcome.completed

    def test_rollback_with_terminated_new_routes(self) -> None:
        """A TERMINATED new route is treated as failed → ROLLED_BACK."""
        fleet = [
            make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY),
            make_route(revision_id=NEW_REV, status=RouteStatus.TERMINATED),
        ]

        outcome = rolling_update_evaluate(
            make_deployment(desired=1),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert outcome.sub_step == DeploymentSubStep.ROLLED_BACK


# ===========================================================================
# 2. max_surge variations
# ===========================================================================


class TestMaxSurge:
    """How ``max_surge`` bounds the number of routes created per cycle."""

    def test_surge_1_desired_1_creates_1(self) -> None:
        """surge=1, desired=1, 1 old → one new route fits (total of 2 allowed)."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=1),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert outcome.sub_step == DeploymentSubStep.PROGRESSING
        assert _count_scale_out(outcome) == 1

    def test_surge_2_desired_3_creates_2(self) -> None:
        """surge=2, desired=3, 3 old → max_total=5 leaves room for 2."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(3)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=3),
            fleet,
            RollingUpdateSpec(max_surge=2, max_unavailable=0),
        )

        assert _count_scale_out(outcome) == 2

    def test_surge_0_desired_3_no_create_without_unavailable(self) -> None:
        """surge=0 and unavailable=0 → no budget to create or terminate."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(3)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=3),
            fleet,
            RollingUpdateSpec(max_surge=0, max_unavailable=0),
        )

        # max_total = 3 + 0 = 3 and current_total = 3 → can_create = 0
        assert _count_scale_out(outcome) == 0
        # min_available = 3 - 0 = 3 and available = 3 → can_terminate = 0
        assert len(_scale_in_ids(outcome)) == 0

    def test_surge_3_desired_2_caps_at_desired(self) -> None:
        """A surge larger than needed is capped by the routes still missing."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(2)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=2),
            fleet,
            RollingUpdateSpec(max_surge=3, max_unavailable=0),
        )

        # max_total = 5, current_total = 2 → can_create = 3
        # still_needed = 2 - 0 = 2 → min(3, 2) = 2
        assert _count_scale_out(outcome) == 2

    def test_surge_already_at_max_no_create(self) -> None:
        """Fleet already at max_total → nothing is created."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(2)]
        fleet.append(make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY))

        outcome = rolling_update_evaluate(
            make_deployment(desired=2),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        # max_total = 3, current = 3 → can_create = 0
        assert _count_scale_out(outcome) == 0


# ===========================================================================
# 3. max_unavailable variations
# ===========================================================================


class TestMaxUnavailable:
    """How ``max_unavailable`` bounds the number of old routes terminated."""

    def test_unavailable_0_no_terminate_until_new_healthy(self) -> None:
        """unavailable=0 and no new healthy route → no old route may go."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(2)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=2),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        # min_available = 2 - 0 = 2, available = 0 (new_healthy) + 2 (old) = 2
        # can_terminate = 2 - 2 = 0
        assert len(_scale_in_ids(outcome)) == 0

    def test_unavailable_1_terminates_1_old(self) -> None:
        """unavailable=1 → one old route is terminated before any new one is ready."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(3)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=3),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=1),
        )

        # min_available = 3 - 1 = 2, available = 0 + 3 = 3 → can_terminate = 1
        assert len(_scale_in_ids(outcome)) == 1

    def test_unavailable_2_terminates_2_old(self) -> None:
        """unavailable=2 with surge=0 → two old routes terminated, none created."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(3)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=3),
            fleet,
            RollingUpdateSpec(max_surge=0, max_unavailable=2),
        )

        # min_available = 3 - 2 = 1, available = 0 + 3 = 3 → can_terminate = 2
        assert len(_scale_in_ids(outcome)) == 2
        # max_total = 3 + 0 = 3, current = 3 → can_create = 0
        # still_needed = 3, but min(0, 3) = 0
        assert _count_scale_out(outcome) == 0

    def test_unavailable_with_new_healthy_allows_more_termination(self) -> None:
        """Each healthy new route frees one old route for termination."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(3)]
        fleet.append(make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY))

        outcome = rolling_update_evaluate(
            make_deployment(desired=3),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        # min_available = 3, available = 1 (new_healthy) + 3 (old) = 4
        # can_terminate = 4 - 3 = 1
        assert len(_scale_in_ids(outcome)) == 1

    def test_unavailable_exceeds_desired_floors_to_zero(self) -> None:
        """unavailable > desired → the availability floor is clamped to 0."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=1),
            fleet,
            RollingUpdateSpec(max_surge=0, max_unavailable=5),
        )

        # min_available = max(0, 1 - 5) = 0, available = 0 + 1 = 1
        # can_terminate = 1 - 0 = 1
        assert len(_scale_in_ids(outcome)) == 1
# ===========================================================================
# 4. Combined surge + unavailable
# ===========================================================================


class TestCombinedSurgeAndUnavailable:
    """Cycles where both ``max_surge`` and ``max_unavailable`` are non-zero."""

    def test_surge_1_unavailable_1_desired_3(self) -> None:
        """surge=1, unavailable=1, desired=3, 3 old → create 1 and terminate 1."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(3)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=3),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=1),
        )

        # max_total = 4, current = 3 → can_create = 1; still_needed = 3 → create 1
        assert _count_scale_out(outcome) == 1
        # min_available = 2, available = 0 + 3 = 3 → can_terminate = 1
        assert len(_scale_in_ids(outcome)) == 1

    def test_surge_2_unavailable_1_desired_4(self) -> None:
        """surge=2, unavailable=1, desired=4, 4 old → create 2 and terminate 1."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(4)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=4),
            fleet,
            RollingUpdateSpec(max_surge=2, max_unavailable=1),
        )

        # max_total = 6, current = 4 → can_create = 2; still_needed = 4 → create 2
        assert _count_scale_out(outcome) == 2
        # min_available = 3, available = 0 + 4 = 4 → can_terminate = 1
        assert len(_scale_in_ids(outcome)) == 1

    def test_aggressive_strategy_surge_3_unavail_2_desired_3(self) -> None:
        """surge=3, unavailable=2, desired=3, 3 old → create 3 and terminate 2."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(3)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=3),
            fleet,
            RollingUpdateSpec(max_surge=3, max_unavailable=2),
        )

        # max_total = 6, current = 3 → can_create = 3; still_needed = 3 → create 3
        assert _count_scale_out(outcome) == 3
        # min_available = 1, available = 0 + 3 = 3 → can_terminate = 2
        assert len(_scale_in_ids(outcome)) == 2


# ===========================================================================
# 5. Multi-cycle progression
# ===========================================================================


class TestMultiCycleProgression:
    """Snapshots of successive cycles of one desired=3 / desired=2 rollout."""

    def test_cycle_2_after_new_routes_become_healthy(self) -> None:
        """1 new healthy + 2 old (desired=3): one more is created, none terminated yet."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(2)]
        fleet.append(make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY))

        outcome = rolling_update_evaluate(
            make_deployment(desired=3),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        # max_total = 4, current = 3 → can_create = 1; still_needed = 2 → create 1
        assert _count_scale_out(outcome) == 1
        # min_available = 3, available = 1 (new_healthy) + 2 (old) = 3
        # can_terminate = 3 - 3 = 0: removing an old route now would drop
        # availability below the floor.
        assert len(_scale_in_ids(outcome)) == 0

    def test_cycle_3_with_2_new_healthy(self) -> None:
        """2 new healthy + 2 old (desired=3): surge is exhausted, one old is terminated."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(2)]
        fleet += [make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY) for _ in range(2)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=3),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        # max_total = 4, current = 4 → can_create = 0
        assert _count_scale_out(outcome) == 0
        # min_available = 3, available = 2 + 2 = 4 → can_terminate = 1
        assert len(_scale_in_ids(outcome)) == 1

    def test_final_cycle_completes(self) -> None:
        """3 new healthy and no old (desired=3) → completed."""
        fleet = [make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY) for _ in range(3)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=3),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert outcome.completed

    def test_not_completed_when_old_still_exists(self) -> None:
        """Enough new routes but one old left → not completed, the old one is terminated."""
        fleet = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)]
        fleet += [make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY) for _ in range(2)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=2),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert not outcome.completed
        # The remaining old route is scheduled for termination
        assert len(_scale_in_ids(outcome)) == 1
# ===========================================================================
# 6. Mixed route statuses
# ===========================================================================


class TestMixedRouteStatuses:
    """Classification of routes whose status is neither HEALTHY nor PROVISIONING."""

    def test_degraded_new_counts_as_healthy(self) -> None:
        """A DEGRADED new route satisfies the completion check (desired=1)."""
        fleet = [make_route(revision_id=NEW_REV, status=RouteStatus.DEGRADED)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=1),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert outcome.completed

    def test_unhealthy_new_counts_as_healthy(self) -> None:
        """An UNHEALTHY new route satisfies the completion check (desired=1)."""
        fleet = [make_route(revision_id=NEW_REV, status=RouteStatus.UNHEALTHY)]

        outcome = rolling_update_evaluate(
            make_deployment(desired=1),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert outcome.completed

    def test_old_terminating_not_counted_as_active(self) -> None:
        """An old TERMINATING route does not block completion."""
        fleet = [
            make_route(revision_id=OLD_REV, status=RouteStatus.TERMINATING),
            make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY),
        ]

        outcome = rolling_update_evaluate(
            make_deployment(desired=1),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        # old_active = 0 (TERMINATING is not counted), new_healthy = 1 >= desired
        assert outcome.completed

    def test_old_terminated_not_counted(self) -> None:
        """An old TERMINATED route does not block completion."""
        fleet = [
            make_route(revision_id=OLD_REV, status=RouteStatus.TERMINATED),
            make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY),
        ]

        outcome = rolling_update_evaluate(
            make_deployment(desired=1),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        assert outcome.completed

    def test_mixed_old_statuses_counts_only_active(self) -> None:
        """Of HEALTHY / TERMINATING / TERMINATED old routes only the first is budgeted."""
        old_statuses = (
            RouteStatus.HEALTHY,
            RouteStatus.TERMINATING,
            RouteStatus.TERMINATED,
        )
        fleet = [make_route(revision_id=OLD_REV, status=st) for st in old_statuses]

        outcome = rolling_update_evaluate(
            make_deployment(desired=2),
            fleet,
            RollingUpdateSpec(max_surge=1, max_unavailable=0),
        )

        # old_active = 1 (only HEALTHY), total_new_live = 0
        # max_total = 3, current = 1 → can_create = 2; still_needed = 2 → create 2
        assert _count_scale_out(outcome) == 2

    def test_mix_of_failed_and_healthy_new_not_rollback(self) -> None:
        """One failed and one healthy new route → still PROGRESSING, no rollback."""
        fleet = [
            make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY),
            make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY),
            make_route(revision_id=NEW_REV, status=RouteStatus.FAILED_TO_START),
        ]

        outcome = rolling_update_evaluate(
            make_deployment(desired=2),
            fleet,
            RollingUpdateSpec(max_surge=2, max_unavailable=0),
        )

        # total_new_live = 1 (healthy) > 0, so the rollback condition is not met
        assert outcome.sub_step == DeploymentSubStep.PROGRESSING
        assert not outcome.completed
Termination priority ordering +# =========================================================================== + + +class TestTerminationPriority: + """Test that old routes are terminated in priority order.""" + + def test_unhealthy_terminated_before_healthy(self) -> None: + """UNHEALTHY old routes should be terminated before HEALTHY ones.""" + unhealthy_id = UUID("00000000-0000-0000-0000-000000000001") + healthy_id = UUID("00000000-0000-0000-0000-000000000002") + + deployment = make_deployment(desired=2) + spec = RollingUpdateSpec(max_surge=0, max_unavailable=1) + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY, route_id=healthy_id), + make_route(revision_id=OLD_REV, status=RouteStatus.UNHEALTHY, route_id=unhealthy_id), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + assert len(_scale_in_ids(result)) == 1 + assert _scale_in_ids(result)[0] == unhealthy_id + + def test_degraded_before_healthy(self) -> None: + """DEGRADED old routes terminated before HEALTHY ones.""" + degraded_id = UUID("00000000-0000-0000-0000-000000000001") + healthy_id = UUID("00000000-0000-0000-0000-000000000002") + + deployment = make_deployment(desired=2) + spec = RollingUpdateSpec(max_surge=0, max_unavailable=1) + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY, route_id=healthy_id), + make_route(revision_id=OLD_REV, status=RouteStatus.DEGRADED, route_id=degraded_id), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + assert len(_scale_in_ids(result)) == 1 + assert _scale_in_ids(result)[0] == degraded_id + + def test_priority_order_unhealthy_degraded_provisioning_healthy(self) -> None: + """Full priority order: unhealthy < degraded < provisioning < healthy.""" + unhealthy_id = UUID("00000000-0000-0000-0000-000000000001") + degraded_id = UUID("00000000-0000-0000-0000-000000000002") + provisioning_id = UUID("00000000-0000-0000-0000-000000000003") + healthy_id = UUID("00000000-0000-0000-0000-000000000004") + 
+ deployment = make_deployment(desired=4) + spec = RollingUpdateSpec(max_surge=0, max_unavailable=3) + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY, route_id=healthy_id), + make_route( + revision_id=OLD_REV, status=RouteStatus.PROVISIONING, route_id=provisioning_id + ), + make_route(revision_id=OLD_REV, status=RouteStatus.DEGRADED, route_id=degraded_id), + make_route(revision_id=OLD_REV, status=RouteStatus.UNHEALTHY, route_id=unhealthy_id), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + terminated = _scale_in_ids(result) + assert len(terminated) == 3 + assert terminated[0] == unhealthy_id + assert terminated[1] == degraded_id + assert terminated[2] == provisioning_id + + +# =========================================================================== +# 8. Edge cases +# =========================================================================== + + +class TestEdgeCases: + """Edge cases and boundary conditions.""" + + def test_desired_0_no_routes_completed(self) -> None: + """desired=0, no routes → completed (vacuously true).""" + deployment = make_deployment(desired=0) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + + result = rolling_update_evaluate(deployment, [], spec) + + assert result.completed + + def test_more_new_healthy_than_desired_still_completes(self) -> None: + """new_healthy > desired and no old → completed.""" + deployment = make_deployment(desired=2) + spec = RollingUpdateSpec(max_surge=2, max_unavailable=0) + routes = [ + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + assert result.completed + + def test_no_routes_no_failed_creates_new(self) -> None: + """Empty routes list → PROGRESSING with scale out.""" + deployment = make_deployment(desired=3) + spec = 
RollingUpdateSpec(max_surge=2, max_unavailable=1) + + result = rolling_update_evaluate(deployment, [], spec) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + # max_total = 5, current = 0, can_create = 5, still_needed = 3 → 3 + assert _count_scale_out(result) == 3 + + def test_only_failed_new_no_old_rolls_back(self) -> None: + """Only failed new routes, no old → ROLLED_BACK.""" + deployment = make_deployment(desired=2) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + routes = [ + make_route(revision_id=NEW_REV, status=RouteStatus.FAILED_TO_START), + make_route(revision_id=NEW_REV, status=RouteStatus.FAILED_TO_START), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + + def test_all_old_inactive_no_new_creates_desired(self) -> None: + """All old routes are inactive (terminated), no new → create desired.""" + deployment = make_deployment(desired=2) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.TERMINATED), + make_route(revision_id=OLD_REV, status=RouteStatus.TERMINATED), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + # old_active = 0, no new → max_total = 3, current = 0, can_create = 3 + # still_needed = 2, min(3, 2) = 2 + assert _count_scale_out(result) == 2 + + def test_large_desired_surge_1_unavailable_0_creates_exactly_1(self) -> None: + """Large desired with conservative settings creates exactly 1.""" + deployment = make_deployment(desired=10) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(10)] + + result = rolling_update_evaluate(deployment, routes, spec) + + assert _count_scale_out(result) == 1 + assert len(_scale_in_ids(result)) == 0 + + def test_deploying_rev_none_all_routes_classified_as_old(self) -> None: + """If deploying_revision_id is None, all routes are 
old (r.revision_id != None).""" + deployment = make_deployment(desired=1, deploying_revision_id=None) # type: ignore[arg-type] + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)] + + result = rolling_update_evaluate(deployment, routes, spec) + + # All classified as old, no new → PROGRESSING with create + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert _count_scale_out(result) == 1 + + def test_route_without_revision_classified_as_old(self) -> None: + """Routes with revision_id=None are classified as old.""" + deployment = make_deployment(desired=1) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + routes = [make_route(revision_id=None, status=RouteStatus.HEALTHY)] # type: ignore[arg-type] + + result = rolling_update_evaluate(deployment, routes, spec) + + # revision_id=None != NEW_REV, so classified as old + assert _count_scale_out(result) == 1 + + def test_provisioning_prioritized_over_completion_check(self) -> None: + """PROVISIONING check comes before completion check.""" + deployment = make_deployment(desired=1) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + routes = [ + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + # Even though new_healthy >= desired, PROVISIONING takes precedence + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + + +# =========================================================================== +# 9. 
Route creator specs validation +# =========================================================================== + + +class TestRouteCreatorSpecs: + """Validate that route creator specs have correct fields.""" + + def test_creator_specs_use_deploying_revision(self) -> None: + """Created routes should use the deploying revision.""" + deployment = make_deployment(desired=1) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + + result = rolling_update_evaluate(deployment, [], spec) + + assert _count_scale_out(result) == 1 + creator_spec = result.route_changes.scale_out_specs[0].spec + assert isinstance(creator_spec, RouteCreatorSpec) + assert creator_spec.revision_id == NEW_REV + assert creator_spec.endpoint_id == ENDPOINT_ID + assert creator_spec.session_owner_id == USER_ID + assert creator_spec.domain == "default" + assert creator_spec.project_id == PROJECT_ID + + def test_multiple_creators_all_correct(self) -> None: + """Multiple creators all have correct metadata.""" + deployment = make_deployment(desired=3) + spec = RollingUpdateSpec(max_surge=3, max_unavailable=0) + + result = rolling_update_evaluate(deployment, [], spec) + + assert _count_scale_out(result) == 3 + for creator in result.route_changes.scale_out_specs: + creator_spec = creator.spec + assert isinstance(creator_spec, RouteCreatorSpec) + assert creator_spec.revision_id == NEW_REV + assert creator_spec.endpoint_id == ENDPOINT_ID + + +# =========================================================================== +# 10. 
Realistic multi-step scenario (desired=5) +# =========================================================================== + + +class TestRealisticScenario: + """Simulate a realistic rolling update with desired=5, surge=2, unavail=1.""" + + def test_step_by_step_rolling_update(self) -> None: + """Full simulation of a rolling update across multiple cycles.""" + deployment = make_deployment(desired=5) + spec = RollingUpdateSpec(max_surge=2, max_unavailable=1) + + # Cycle 1: 5 old, 0 new + old_routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(5)] + r1 = rolling_update_evaluate(deployment, old_routes, spec) + + # max_total = 7, current = 5, can_create = 2, still_needed = 5 → 2 + assert _count_scale_out(r1) == 2 + # min_available = 4, available = 0+5 = 5, can_terminate = 1 + assert len(_scale_in_ids(r1)) == 1 + + # Cycle 2: 4 old, 2 new healthy + routes_c2 = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + ] + r2 = rolling_update_evaluate(deployment, routes_c2, spec) + + # max_total = 7, current = 6, can_create = 1, still_needed = 3 → 1 + assert _count_scale_out(r2) == 1 + # min_available = 4, available = 2+4 = 6, can_terminate = 2 + assert len(_scale_in_ids(r2)) == 2 + + # Cycle 3: 2 old, 3 new healthy + routes_c3 = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + ] + r3 = rolling_update_evaluate(deployment, routes_c3, spec) + + # max_total = 7, 
current = 5, can_create = 2, still_needed = 2 → 2 + assert _count_scale_out(r3) == 2 + # min_available = 4, available = 3+2 = 5, can_terminate = 1 + assert len(_scale_in_ids(r3)) == 1 + + # Cycle 4: 1 old, 5 new healthy + routes_c4 = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + ] + r4 = rolling_update_evaluate(deployment, routes_c4, spec) + + # can_create = 0 (still_needed = 0), can_terminate = 1 + assert _count_scale_out(r4) == 0 + assert len(_scale_in_ids(r4)) == 1 + assert not r4.completed + + # Cycle 5: 0 old, 5 new healthy → completed + routes_c5 = [ + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + ] + r5 = rolling_update_evaluate(deployment, routes_c5, spec) + + assert r5.completed + + +# =========================================================================== +# 11. Deadlock and stall detection +# =========================================================================== + + +class TestDeadlockAndStall: + """Test scenarios where the FSM could potentially stall.""" + + def test_surge_0_unavailable_0_deadlock(self) -> None: + """Both surge=0 and unavailable=0 → no progress possible (deadlock). + + This is a configuration error: at least one must be > 0 for progress. + The FSM correctly returns PROGRESSING with no changes. 
+ """ + deployment = make_deployment(desired=3) + spec = RollingUpdateSpec(max_surge=0, max_unavailable=0) + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert _count_scale_out(result) == 0 + assert len(_scale_in_ids(result)) == 0 + # This is a known deadlock — no progress is possible. + + def test_surge_0_unavailable_1_terminates_first_then_creates(self) -> None: + """surge=0, unavailable=1 → terminate 1, then next cycle creates 1. + + This pattern kills old routes before creating new ones (downtime-tolerant). + """ + deployment = make_deployment(desired=3) + spec = RollingUpdateSpec(max_surge=0, max_unavailable=1) + + # Cycle 1: 3 old → terminate 1, create 0 + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + ] + r1 = rolling_update_evaluate(deployment, routes, spec) + assert _count_scale_out(r1) == 0 + assert len(_scale_in_ids(r1)) == 1 + + # Cycle 2: 2 old → now we can create 1 + routes_c2 = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + ] + r2 = rolling_update_evaluate(deployment, routes_c2, spec) + # max_total = 3, current = 2, can_create = 1, still_needed = 3 → 1 + assert _count_scale_out(r2) == 1 + # min_available = 2, available = 0+2 = 2, can_terminate = 0 + assert len(_scale_in_ids(r2)) == 0 + + def test_partial_new_failure_continues_progress(self) -> None: + """Some new routes fail while others succeed → continue, no rollback.""" + deployment = make_deployment(desired=3) + spec = RollingUpdateSpec(max_surge=2, max_unavailable=0) + routes = [ + 
make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.FAILED_TO_START), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + # total_new_live = 1 > 0, so NOT rolled back + assert result.sub_step == DeploymentSubStep.PROGRESSING + # still_needed = 3-1 = 2, max_total=5, current=4 → can_create = 1 + assert _count_scale_out(result) == 1 + + def test_new_routes_exceed_desired_no_extra_create(self) -> None: + """More new_live than desired → no extra creation (still_needed < 0).""" + deployment = make_deployment(desired=2) + spec = RollingUpdateSpec(max_surge=2, max_unavailable=0) + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + # still_needed = 2-3 = -1 → to_create = max(0, ...) = 0 + assert _count_scale_out(result) == 0 + # min_available = 2, available = 3+1 = 4, can_terminate = 2 → min(2, 1) = 1 + assert len(_scale_in_ids(result)) == 1 + + +# =========================================================================== +# 12. 
desired_replica_count vs replica_count +# =========================================================================== + + +class TestDesiredReplicaCount: + """Test that the correct desired count is used.""" + + def test_desired_replica_count_overrides_replica_count(self) -> None: + """When desired_replica_count is set, it takes precedence.""" + deployment = make_deployment(desired=3) + # Override desired_replica_count + deployment.replica_spec = ReplicaSpec( + replica_count=1, + desired_replica_count=3, + ) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)] + + result = rolling_update_evaluate(deployment, routes, spec) + + # desired is 3 (from desired_replica_count), not 1 + # max_total = 4, current = 1, can_create = 3, still_needed = 3 → 3 + assert _count_scale_out(result) == 3 + + def test_replica_count_used_when_no_desired(self) -> None: + """When desired_replica_count is None, uses replica_count.""" + deployment = make_deployment(desired=2) + deployment.replica_spec = ReplicaSpec( + replica_count=2, + desired_replica_count=None, + ) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + routes = [ + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + assert result.completed + + +# =========================================================================== +# 13. 
Scale-down during rolling update +# =========================================================================== + + +class TestScaleDownDuringRollingUpdate: + """Test behavior when desired is reduced during rolling update.""" + + def test_desired_reduced_terminates_excess_old(self) -> None: + """If desired is lowered, more old can be terminated.""" + deployment = make_deployment(desired=1) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + # max_total = 2, current = 3 → can_create = max(0, -1) = 0 + assert _count_scale_out(result) == 0 + # still_needed = 1 - 0 = 1, but creation is capped by max_total: + # can_create = 2 - 3 = -1 → to_create = max(0, min(-1, 1)) = 0 + # min_available = 1, available = 0+3 = 3, can_terminate = 2 + assert len(_scale_in_ids(result)) == 2 + + def test_desired_increased_creates_more(self) -> None: + """If desired is raised, more new routes are created.""" + deployment = make_deployment(desired=5) + spec = RollingUpdateSpec(max_surge=2, max_unavailable=0) + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + # max_total = 7, current = 2, can_create = 5, still_needed = 5 → 5 + assert _count_scale_out(result) == 5 + + +# =========================================================================== +# 14. 
Concurrent provisioning and termination +# =========================================================================== + + +class TestConcurrentOperations: + """Test that provisioning blocks further changes correctly.""" + + def test_provisioning_blocks_all_further_actions(self) -> None: + """Any new route in PROVISIONING → wait, even if old can be terminated.""" + deployment = make_deployment(desired=3) + spec = RollingUpdateSpec(max_surge=2, max_unavailable=1) + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + # PROVISIONING takes priority over all other decisions + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert _count_scale_out(result) == 0 + assert len(_scale_in_ids(result)) == 0 + + def test_multiple_provisioning_routes_still_waits(self) -> None: + """Multiple PROVISIONING routes → still PROVISIONING.""" + deployment = make_deployment(desired=3) + spec = RollingUpdateSpec(max_surge=3, max_unavailable=3) + routes = [ + make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), + make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), + make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), + ] + + result = rolling_update_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + + def test_old_provisioning_counted_as_active(self) -> None: + """Old routes in PROVISIONING are counted as old_active.""" + deployment = make_deployment(desired=2) + spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.PROVISIONING), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), + ] + + result = rolling_update_evaluate(deployment, 
routes, spec) + + # old_active = 2 (both PROVISIONING and HEALTHY are active) + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed From 12347c571d8c4b29da7f84c74353eeff56bd86a8 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 07:23:33 +0000 Subject: [PATCH 2/8] docs: Add news fragment --- changes/9567.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/9567.feature.md diff --git a/changes/9567.feature.md b/changes/9567.feature.md new file mode 100644 index 00000000000..1065196f9bd --- /dev/null +++ b/changes/9567.feature.md @@ -0,0 +1 @@ +Implement Rolling Update deployment strategy From c8bc1f10c79c2c190669a16d9cab61141543e0bd Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 01:53:45 +0000 Subject: [PATCH 3/8] wip --- .../deployment/db_source/db_source.py | 6 +- .../repositories/deployment/repository.py | 3 +- .../manager/sokovan/deployment/coordinator.py | 19 +- .../sokovan/deployment/strategy/blue_green.py | 161 +++++++- .../sokovan/deployment/strategy/evaluator.py | 11 +- .../sokovan/deployment/strategy/types.py | 1 + .../deployment/strategy/test_blue_green.py | 371 ++++++++++++++++++ 7 files changed, 560 insertions(+), 12 deletions(-) create mode 100644 tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py diff --git a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py index 38aa75d45f2..ecebecdff56 100644 --- a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py +++ b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py @@ -1410,8 +1410,9 @@ async def scale_routes( self, scale_out_creators: Sequence[Creator[RoutingRow]], scale_in_updater: BatchUpdater[RoutingRow] | None, + promote_updater: BatchUpdater[RoutingRow] | None = None, ) -> None: - """Scale out/in routes based on provided creators and updater.""" + """Scale out/in/promote 
routes based on provided creators and updaters.""" async with self._begin_session_read_committed() as db_sess: # Scale out routes for creator in scale_out_creators: @@ -1419,6 +1420,9 @@ async def scale_routes( # Scale in routes if scale_in_updater: await execute_batch_updater(db_sess, scale_in_updater) + # Promote routes (blue-green) + if promote_updater: + await execute_batch_updater(db_sess, promote_updater) # Route operations diff --git a/src/ai/backend/manager/repositories/deployment/repository.py b/src/ai/backend/manager/repositories/deployment/repository.py index d78f05ffe04..46014a2c330 100644 --- a/src/ai/backend/manager/repositories/deployment/repository.py +++ b/src/ai/backend/manager/repositories/deployment/repository.py @@ -553,8 +553,9 @@ async def scale_routes( self, scale_out_creators: Sequence[Creator[RoutingRow]], scale_in_updater: BatchUpdater[RoutingRow] | None, + promote_updater: BatchUpdater[RoutingRow] | None = None, ) -> None: - await self._db_source.scale_routes(scale_out_creators, scale_in_updater) + await self._db_source.scale_routes(scale_out_creators, scale_in_updater, promote_updater) # Route operations diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index d617fda9569..f78f03e74f0 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -498,7 +498,7 @@ async def _apply_route_changes( ) -> None: """Apply aggregated route mutations from the evaluation result.""" changes = eval_result.route_changes - if not changes.rollout_specs and not changes.drain_route_ids: + if not changes.rollout_specs and not changes.drain_route_ids and not changes.promote_route_ids: return scale_in_updater: BatchUpdater[RoutingRow] | None = None @@ -512,11 +512,24 @@ async def _apply_route_changes( conditions=[RouteConditions.by_ids(changes.drain_route_ids)], ) - await 
self._deployment_repository.scale_routes(changes.rollout_specs, scale_in_updater) + promote_updater: BatchUpdater[RoutingRow] | None = None + if changes.promote_route_ids: + promote_updater = BatchUpdater( + spec=RouteBatchUpdaterSpec( + traffic_status=RouteTrafficStatus.ACTIVE, + traffic_ratio=1.0, + ), + conditions=[RouteConditions.by_ids(changes.promote_route_ids)], + ) + + await self._deployment_repository.scale_routes( + changes.rollout_specs, scale_in_updater, promote_updater + ) log.debug( - "Applied route changes: {} created, {} terminated", + "Applied route changes: {} created, {} terminated, {} promoted", len(changes.rollout_specs), len(changes.drain_route_ids), + len(changes.promote_route_ids), ) async def _transition_completed_deployments( diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py index 6e76625ed8b..0791f881b4a 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py @@ -1,20 +1,30 @@ """Blue-green deployment strategy evaluation for a single deployment cycle (BEP-1049). -Provisions a full set of new-revision routes, validates them, then atomically -switches traffic from the old revision to the new one. +Provisions a full set of new-revision routes (INACTIVE), validates them, then +atomically switches traffic from the old revision to the new one. 
""" from __future__ import annotations +import logging from collections.abc import Sequence +from ai.backend.logging import BraceStyleAdapter from ai.backend.manager.data.deployment.types import ( DeploymentInfo, + DeploymentSubStep, RouteInfo, + RouteStatus, + RouteTrafficStatus, ) from ai.backend.manager.models.deployment_policy import BlueGreenSpec +from ai.backend.manager.models.routing import RoutingRow +from ai.backend.manager.repositories.base import Creator +from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec -from .types import CycleEvaluationResult +from .types import CycleEvaluationResult, RouteChanges + +log = BraceStyleAdapter(logging.getLogger(__name__)) def blue_green_evaluate( @@ -22,5 +32,146 @@ def blue_green_evaluate( routes: Sequence[RouteInfo], spec: BlueGreenSpec, ) -> CycleEvaluationResult: - """Evaluate one cycle of blue-green deployment for a single deployment.""" - raise NotImplementedError("Blue-green deployment strategy is not yet implemented") + """Evaluate one cycle of blue-green deployment for a single deployment. + + FSM flow: + 1. Classify routes into blue (old) / green (new) by revision_id. + 2. If no green routes → create all green (INACTIVE) → PROVISIONING. + 3. If any green PROVISIONING → PROVISIONING (wait). + 4. If all green failed → scale_in green → ROLLED_BACK. + 5. If not all green healthy → PROGRESSING (wait). + 6. If all green healthy + auto_promote=False → PROGRESSING (manual wait). + 7. If all green healthy + auto_promote=True + delay>0 → PROGRESSING (delay wait). + 8. If all green healthy + auto_promote=True + delay=0 → promote + completed. + """ + deploying_rev = deployment.deploying_revision_id + desired = deployment.replica_spec.target_replica_count + + # ── 1. 
Classify routes ── + blue_active: list[RouteInfo] = [] + green_provisioning: list[RouteInfo] = [] + green_healthy: list[RouteInfo] = [] + green_failed: list[RouteInfo] = [] + + for r in routes: + is_green = r.revision_id == deploying_rev + if not is_green: + if r.status.is_active(): + blue_active.append(r) + continue + + if r.status == RouteStatus.PROVISIONING: + green_provisioning.append(r) + elif r.status == RouteStatus.HEALTHY: + green_healthy.append(r) + elif r.status in (RouteStatus.FAILED_TO_START, RouteStatus.TERMINATED): + green_failed.append(r) + elif r.status.is_active(): + green_healthy.append(r) + + total_green_live = len(green_provisioning) + len(green_healthy) + + # ── 2. No green routes → create all green (INACTIVE) ── + if total_green_live == 0 and not green_failed: + log.debug( + "deployment {}: no green routes — creating {} INACTIVE routes", + deployment.id, + desired, + ) + route_changes = RouteChanges( + scale_out_specs=_build_route_creators(deployment, desired), + ) + return CycleEvaluationResult( + sub_step=DeploymentSubStep.PROVISIONING, + route_changes=route_changes, + ) + + # ── 3. Green PROVISIONING → wait ── + if green_provisioning: + log.debug( + "deployment {}: {} green routes still provisioning", + deployment.id, + len(green_provisioning), + ) + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROVISIONING) + + # ── 4. All green failed → rollback ── + if total_green_live == 0 and green_failed: + log.warning( + "deployment {}: all {} green routes failed — rolling back", + deployment.id, + len(green_failed), + ) + route_changes = RouteChanges( + scale_in_route_ids=[r.route_id for r in green_failed], + ) + return CycleEvaluationResult( + sub_step=DeploymentSubStep.ROLLED_BACK, + route_changes=route_changes, + ) + + # ── 5. 
Not all green healthy → PROGRESSING (wait) ── + if len(green_healthy) < desired: + log.debug( + "deployment {}: green healthy={}/{} — waiting", + deployment.id, + len(green_healthy), + desired, + ) + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) + + # ── All green healthy from here ── + + # ── 6. auto_promote=False → PROGRESSING (manual wait) ── + if not spec.auto_promote: + log.debug( + "deployment {}: all green healthy, waiting for manual promotion", + deployment.id, + ) + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) + + # ── 7. auto_promote=True + delay>0 → PROGRESSING (delay wait) ── + if spec.promote_delay_seconds > 0: + log.debug( + "deployment {}: all green healthy, waiting for promote delay ({}s)", + deployment.id, + spec.promote_delay_seconds, + ) + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) + + # ── 8. Promotion: green → ACTIVE, blue → TERMINATING ── + log.info( + "deployment {}: promoting {} green routes, terminating {} blue routes", + deployment.id, + len(green_healthy), + len(blue_active), + ) + route_changes = RouteChanges( + promote_route_ids=[r.route_id for r in green_healthy], + scale_in_route_ids=[r.route_id for r in blue_active], + ) + return CycleEvaluationResult( + sub_step=DeploymentSubStep.PROGRESSING, + completed=True, + route_changes=route_changes, + ) + + +def _build_route_creators( + deployment: DeploymentInfo, + count: int, +) -> list[Creator[RoutingRow]]: + """Build route creator specs for green routes (INACTIVE, traffic_ratio=0.0).""" + creators: list[Creator[RoutingRow]] = [] + for _ in range(count): + creator_spec = RouteCreatorSpec( + endpoint_id=deployment.id, + session_owner_id=deployment.metadata.session_owner, + domain=deployment.metadata.domain, + project_id=deployment.metadata.project, + revision_id=deployment.deploying_revision_id, + traffic_status=RouteTrafficStatus.INACTIVE, + traffic_ratio=0.0, + ) + creators.append(Creator(spec=creator_spec)) + 
return creators diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index 9d84ced2104..79c7000c033 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -92,6 +92,7 @@ async def evaluate( changes = cycle_result.route_changes result.route_changes.rollout_specs.extend(changes.rollout_specs) result.route_changes.drain_route_ids.extend(changes.drain_route_ids) + result.route_changes.promote_route_ids.extend(changes.promote_route_ids) self._record_route_changes(deployment, changes) # Group by sub-step @@ -109,8 +110,8 @@ async def evaluate( @staticmethod def _record_route_changes(deployment: DeploymentInfo, changes: RouteChanges) -> None: - """Record rollout/drain operations as sub-steps for observability.""" - if not changes.rollout_specs and not changes.drain_route_ids: + """Record rollout/drain/promote operations as sub-steps for observability.""" + if not changes.rollout_specs and not changes.drain_route_ids and not changes.promote_route_ids: return pool = DeploymentRecorderContext.current_pool() recorder = pool.recorder(deployment.id) @@ -127,6 +128,12 @@ def _record_route_changes(deployment: DeploymentInfo, changes: RouteChanges) -> success_detail=f"{len(changes.drain_route_ids)} route(s)", ): pass + if changes.promote_route_ids: + with recorder.step( + "promote", + success_detail=f"{len(changes.promote_route_ids)} route(s)", + ): + pass def _evaluate_single( self, diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/types.py b/src/ai/backend/manager/sokovan/deployment/strategy/types.py index 615d6e8238f..8c32682e6a0 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/types.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/types.py @@ -20,6 +20,7 @@ class RouteChanges: rollout_specs: list[Creator[RoutingRow]] = field(default_factory=list) 
drain_route_ids: list[UUID] = field(default_factory=list) + promote_route_ids: list[UUID] = field(default_factory=list) @dataclass diff --git a/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py b/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py new file mode 100644 index 00000000000..28c7c67ccc5 --- /dev/null +++ b/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py @@ -0,0 +1,371 @@ +"""Unit tests for the blue-green deployment strategy FSM (BEP-1049).""" + +from __future__ import annotations + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +from ai.backend.common.data.endpoint.types import EndpointLifecycle +from ai.backend.common.types import SessionId +from ai.backend.manager.data.deployment.types import ( + DeploymentInfo, + DeploymentMetadata, + DeploymentNetworkSpec, + DeploymentState, + DeploymentSubStep, + ReplicaSpec, + RouteInfo, + RouteStatus, + RouteTrafficStatus, +) +from ai.backend.manager.models.deployment_policy import BlueGreenSpec +from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec +from ai.backend.manager.sokovan.deployment.strategy.blue_green import blue_green_evaluate + +# ── Helpers ── + + +def _make_deployment( + *, + desired: int = 3, + deploying_revision_id: UUID | None = None, + current_revision_id: UUID | None = None, +) -> DeploymentInfo: + endpoint_id = uuid4() + return DeploymentInfo( + id=endpoint_id, + metadata=DeploymentMetadata( + name="test-deployment", + domain="default", + project=uuid4(), + resource_group="default", + created_user=uuid4(), + session_owner=uuid4(), + created_at=datetime.now(UTC), + revision_history_limit=5, + ), + state=DeploymentState( + lifecycle=EndpointLifecycle.DEPLOYING, + retry_count=0, + ), + replica_spec=ReplicaSpec(replica_count=desired), + network=DeploymentNetworkSpec( + open_to_public=False, + ), + model_revisions=[], + current_revision_id=current_revision_id or uuid4(), + 
deploying_revision_id=deploying_revision_id or uuid4(), + ) + + +def _make_route( + *, + endpoint_id: UUID, + revision_id: UUID | None = None, + status: RouteStatus = RouteStatus.HEALTHY, + traffic_status: RouteTrafficStatus = RouteTrafficStatus.ACTIVE, + traffic_ratio: float = 1.0, +) -> RouteInfo: + return RouteInfo( + route_id=uuid4(), + endpoint_id=endpoint_id, + session_id=SessionId(uuid4()), + status=status, + traffic_ratio=traffic_ratio, + created_at=datetime.now(UTC), + revision_id=revision_id, + traffic_status=traffic_status, + ) + + +def _blue_routes( + deployment: DeploymentInfo, + count: int, + *, + status: RouteStatus = RouteStatus.HEALTHY, +) -> list[RouteInfo]: + return [ + _make_route( + endpoint_id=deployment.id, + revision_id=deployment.current_revision_id, + status=status, + traffic_status=RouteTrafficStatus.ACTIVE, + traffic_ratio=1.0, + ) + for _ in range(count) + ] + + +def _green_routes( + deployment: DeploymentInfo, + count: int, + *, + status: RouteStatus = RouteStatus.HEALTHY, + traffic_status: RouteTrafficStatus = RouteTrafficStatus.INACTIVE, + traffic_ratio: float = 0.0, +) -> list[RouteInfo]: + return [ + _make_route( + endpoint_id=deployment.id, + revision_id=deployment.deploying_revision_id, + status=status, + traffic_status=traffic_status, + traffic_ratio=traffic_ratio, + ) + for _ in range(count) + ] + + +# ── Test Classes ── + + +class TestBlueGreenNoGreenRoutes: + """When no green routes exist, all should be created as INACTIVE.""" + + def test_creates_all_green_inactive(self) -> None: + deployment = _make_deployment(desired=3) + blues = _blue_routes(deployment, 3) + + result = blue_green_evaluate(deployment, blues, BlueGreenSpec()) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + assert len(result.route_changes.scale_out_specs) == 3 + assert not result.route_changes.scale_in_route_ids + assert not result.route_changes.promote_route_ids + + def 
test_creator_spec_has_inactive_traffic(self) -> None: + deployment = _make_deployment(desired=2) + blues = _blue_routes(deployment, 2) + + result = blue_green_evaluate(deployment, blues, BlueGreenSpec()) + + for creator in result.route_changes.scale_out_specs: + spec = creator.spec + assert isinstance(spec, RouteCreatorSpec) + assert spec.traffic_status == RouteTrafficStatus.INACTIVE + assert spec.traffic_ratio == 0.0 + assert spec.revision_id == deployment.deploying_revision_id + + def test_no_blue_routes_fresh_deployment(self) -> None: + """First deployment with no existing routes.""" + deployment = _make_deployment(desired=3) + + result = blue_green_evaluate(deployment, [], BlueGreenSpec()) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert len(result.route_changes.scale_out_specs) == 3 + + +class TestBlueGreenProvisioning: + """When green routes are still PROVISIONING, the FSM should wait.""" + + def test_all_green_provisioning(self) -> None: + deployment = _make_deployment(desired=3) + blues = _blue_routes(deployment, 3) + greens = _green_routes(deployment, 3, status=RouteStatus.PROVISIONING) + + result = blue_green_evaluate(deployment, blues + greens, BlueGreenSpec()) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + assert not result.route_changes.scale_out_specs + assert not result.route_changes.scale_in_route_ids + assert not result.route_changes.promote_route_ids + + def test_partial_provisioning_partial_healthy(self) -> None: + deployment = _make_deployment(desired=3) + blues = _blue_routes(deployment, 3) + greens = _green_routes(deployment, 1, status=RouteStatus.HEALTHY) + _green_routes( + deployment, 2, status=RouteStatus.PROVISIONING + ) + + result = blue_green_evaluate(deployment, blues + greens, BlueGreenSpec()) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + + +class TestBlueGreenAllGreenFailed: + """When all green routes have failed, rollback 
should occur.""" + + def test_all_green_failed_rollback(self) -> None: + deployment = _make_deployment(desired=3) + blues = _blue_routes(deployment, 3) + greens = _green_routes(deployment, 3, status=RouteStatus.FAILED_TO_START) + + result = blue_green_evaluate(deployment, blues + greens, BlueGreenSpec()) + + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + assert not result.completed + green_ids = {r.route_id for r in greens} + assert set(result.route_changes.scale_in_route_ids) == green_ids + assert not result.route_changes.promote_route_ids + + def test_all_green_terminated_rollback(self) -> None: + deployment = _make_deployment(desired=2) + blues = _blue_routes(deployment, 2) + greens = _green_routes(deployment, 2, status=RouteStatus.TERMINATED) + + result = blue_green_evaluate(deployment, blues + greens, BlueGreenSpec()) + + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + assert not result.completed + + +class TestBlueGreenMixedGreen: + """When green routes are in mixed states (healthy + failed, no provisioning).""" + + def test_healthy_and_failed_mixed(self) -> None: + deployment = _make_deployment(desired=3) + blues = _blue_routes(deployment, 3) + greens = _green_routes(deployment, 1, status=RouteStatus.HEALTHY) + _green_routes( + deployment, 2, status=RouteStatus.FAILED_TO_START + ) + + result = blue_green_evaluate(deployment, blues + greens, BlueGreenSpec()) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + + +class TestBlueGreenPromotion: + """When all green routes are healthy and promotion should happen.""" + + def test_auto_promote_true_delay_zero_completed(self) -> None: + deployment = _make_deployment(desired=3) + blues = _blue_routes(deployment, 3) + greens = _green_routes(deployment, 3, status=RouteStatus.HEALTHY) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, blues + greens, spec) + + assert result.sub_step == 
DeploymentSubStep.PROGRESSING + assert result.completed + # Green route IDs should be promoted + green_ids = {r.route_id for r in greens} + assert set(result.route_changes.promote_route_ids) == green_ids + # Blue route IDs should be scaled in + blue_ids = {r.route_id for r in blues} + assert set(result.route_changes.scale_in_route_ids) == blue_ids + # No new routes created + assert not result.route_changes.scale_out_specs + + def test_auto_promote_false_manual_wait(self) -> None: + deployment = _make_deployment(desired=3) + blues = _blue_routes(deployment, 3) + greens = _green_routes(deployment, 3, status=RouteStatus.HEALTHY) + spec = BlueGreenSpec(auto_promote=False, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, blues + greens, spec) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + assert not result.route_changes.promote_route_ids + assert not result.route_changes.scale_in_route_ids + + def test_auto_promote_true_delay_positive_wait(self) -> None: + deployment = _make_deployment(desired=3) + blues = _blue_routes(deployment, 3) + greens = _green_routes(deployment, 3, status=RouteStatus.HEALTHY) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=60) + + result = blue_green_evaluate(deployment, blues + greens, spec) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + assert not result.route_changes.promote_route_ids + assert not result.route_changes.scale_in_route_ids + + +class TestBlueGreenSingleReplica: + """Edge case: desired=1 single replica.""" + + def test_single_replica_no_green(self) -> None: + deployment = _make_deployment(desired=1) + blues = _blue_routes(deployment, 1) + + result = blue_green_evaluate(deployment, blues, BlueGreenSpec()) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert len(result.route_changes.scale_out_specs) == 1 + + def test_single_replica_promotion(self) -> None: + deployment = 
_make_deployment(desired=1) + blues = _blue_routes(deployment, 1) + greens = _green_routes(deployment, 1, status=RouteStatus.HEALTHY) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, blues + greens, spec) + + assert result.completed + assert len(result.route_changes.promote_route_ids) == 1 + assert len(result.route_changes.scale_in_route_ids) == 1 + + +class TestBlueGreenManyReplicas: + """Edge case: desired=5 many replicas.""" + + def test_many_replicas_creates_all(self) -> None: + deployment = _make_deployment(desired=5) + blues = _blue_routes(deployment, 5) + + result = blue_green_evaluate(deployment, blues, BlueGreenSpec()) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert len(result.route_changes.scale_out_specs) == 5 + + def test_many_replicas_promotion(self) -> None: + deployment = _make_deployment(desired=5) + blues = _blue_routes(deployment, 5) + greens = _green_routes(deployment, 5, status=RouteStatus.HEALTHY) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, blues + greens, spec) + + assert result.completed + assert len(result.route_changes.promote_route_ids) == 5 + assert len(result.route_changes.scale_in_route_ids) == 5 + + +class TestBlueGreenNoBlueRoutes: + """When there are no blue routes (fresh deployment).""" + + def test_promotion_no_blue(self) -> None: + """Promotion with no blue routes to terminate.""" + deployment = _make_deployment(desired=3) + greens = _green_routes(deployment, 3, status=RouteStatus.HEALTHY) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, greens, spec) + + assert result.completed + green_ids = {r.route_id for r in greens} + assert set(result.route_changes.promote_route_ids) == green_ids + assert not result.route_changes.scale_in_route_ids + + +class TestBlueGreenPromotionRouteIdVerification: + """Verify promote and 
scale_in route IDs are exact matches.""" + + def test_promote_ids_match_green_healthy(self) -> None: + deployment = _make_deployment(desired=3) + blues = _blue_routes(deployment, 3) + greens = _green_routes(deployment, 3, status=RouteStatus.HEALTHY) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, blues + greens, spec) + + expected_promote = [r.route_id for r in greens] + assert result.route_changes.promote_route_ids == expected_promote + + def test_scale_in_ids_match_blue_active(self) -> None: + deployment = _make_deployment(desired=3) + blues = _blue_routes(deployment, 3) + greens = _green_routes(deployment, 3, status=RouteStatus.HEALTHY) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, blues + greens, spec) + + expected_scale_in = [r.route_id for r in blues] + assert result.route_changes.scale_in_route_ids == expected_scale_in From febce7331cc56c4d8b1d747266a182d2f85c4898 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 02:22:52 +0000 Subject: [PATCH 4/8] docs: Add news fragment --- changes/9568.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/9568.feature.md diff --git a/changes/9568.feature.md b/changes/9568.feature.md new file mode 100644 index 00000000000..c956a901688 --- /dev/null +++ b/changes/9568.feature.md @@ -0,0 +1 @@ +Implement Blue-Green deployment strategy From 506588bb05bd3b6d53f85e9a1ff750a9c537a5d1 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 02:30:49 +0000 Subject: [PATCH 5/8] wip --- .../deployment/strategy/test_blue_green.py | 1522 +++++++++++++++-- 1 file changed, 1348 insertions(+), 174 deletions(-) diff --git a/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py b/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py index 28c7c67ccc5..da7bec443a3 100644 --- a/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py +++ 
b/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py @@ -1,4 +1,17 @@ -"""Unit tests for the blue-green deployment strategy FSM (BEP-1049).""" +"""Comprehensive tests for the blue-green deployment strategy FSM (BEP-1049). + +Tests cover: +- FSM state transitions: PROVISIONING, PROGRESSING, ROLLED_BACK, completed +- auto_promote / promote_delay_seconds combinations +- Single and multi-replica scenarios +- Edge cases: no routes, all failed, mixed statuses, desired=0 +- Multi-cycle progression simulation +- Route creator specs validation +- desired_replica_count vs replica_count +- Scale-down during blue-green deployment +- Concurrent provisioning checks +- Realistic multi-step scenarios +""" from __future__ import annotations @@ -21,26 +34,31 @@ from ai.backend.manager.models.deployment_policy import BlueGreenSpec from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec from ai.backend.manager.sokovan.deployment.strategy.blue_green import blue_green_evaluate +from ai.backend.manager.sokovan.deployment.strategy.types import CycleEvaluationResult -# ── Helpers ── +ENDPOINT_ID = UUID("aaaaaaaa-0000-0000-0000-aaaaaaaaaaaa") +OLD_REV = UUID("11111111-1111-1111-1111-111111111111") +NEW_REV = UUID("22222222-2222-2222-2222-222222222222") +PROJECT_ID = UUID("cccccccc-cccc-cccc-cccc-cccccccccccc") +USER_ID = UUID("dddddddd-dddd-dddd-dddd-dddddddddddd") -def _make_deployment( +def make_deployment( *, desired: int = 3, - deploying_revision_id: UUID | None = None, - current_revision_id: UUID | None = None, + deploying_revision_id: UUID = NEW_REV, + current_revision_id: UUID = OLD_REV, + endpoint_id: UUID = ENDPOINT_ID, ) -> DeploymentInfo: - endpoint_id = uuid4() return DeploymentInfo( id=endpoint_id, metadata=DeploymentMetadata( - name="test-deployment", + name="test-deploy", domain="default", - project=uuid4(), + project=PROJECT_ID, resource_group="default", - created_user=uuid4(), - session_owner=uuid4(), + created_user=USER_ID, + 
session_owner=USER_ID, created_at=datetime.now(UTC), revision_history_limit=5, ), @@ -48,26 +66,33 @@ def _make_deployment( lifecycle=EndpointLifecycle.DEPLOYING, retry_count=0, ), - replica_spec=ReplicaSpec(replica_count=desired), - network=DeploymentNetworkSpec( - open_to_public=False, + replica_spec=ReplicaSpec( + replica_count=desired, ), + network=DeploymentNetworkSpec(open_to_public=False), model_revisions=[], - current_revision_id=current_revision_id or uuid4(), - deploying_revision_id=deploying_revision_id or uuid4(), + current_revision_id=current_revision_id, + deploying_revision_id=deploying_revision_id, ) -def _make_route( +def make_route( *, - endpoint_id: UUID, - revision_id: UUID | None = None, + revision_id: UUID, status: RouteStatus = RouteStatus.HEALTHY, - traffic_status: RouteTrafficStatus = RouteTrafficStatus.ACTIVE, - traffic_ratio: float = 1.0, + endpoint_id: UUID = ENDPOINT_ID, + route_id: UUID | None = None, + traffic_status: RouteTrafficStatus | None = None, + traffic_ratio: float | None = None, ) -> RouteInfo: + if traffic_status is None: + traffic_status = ( + RouteTrafficStatus.ACTIVE if status.is_active() else RouteTrafficStatus.INACTIVE + ) + if traffic_ratio is None: + traffic_ratio = 1.0 if status.is_active() else 0.0 return RouteInfo( - route_id=uuid4(), + route_id=route_id or uuid4(), endpoint_id=endpoint_id, session_id=SessionId(uuid4()), status=status, @@ -78,16 +103,31 @@ def _make_route( ) +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _count_scale_out(result: CycleEvaluationResult) -> int: + return len(result.route_changes.scale_out_specs) + + +def _scale_in_ids(result: CycleEvaluationResult) -> list[UUID]: + return result.route_changes.scale_in_route_ids + + +def _promote_ids(result: CycleEvaluationResult) -> list[UUID]: + return result.route_changes.promote_route_ids + + def _blue_routes( - 
deployment: DeploymentInfo, count: int, *, status: RouteStatus = RouteStatus.HEALTHY, ) -> list[RouteInfo]: return [ - _make_route( - endpoint_id=deployment.id, - revision_id=deployment.current_revision_id, + make_route( + revision_id=OLD_REV, status=status, traffic_status=RouteTrafficStatus.ACTIVE, traffic_ratio=1.0, @@ -97,7 +137,6 @@ def _blue_routes( def _green_routes( - deployment: DeploymentInfo, count: int, *, status: RouteStatus = RouteStatus.HEALTHY, @@ -105,9 +144,8 @@ def _green_routes( traffic_ratio: float = 0.0, ) -> list[RouteInfo]: return [ - _make_route( - endpoint_id=deployment.id, - revision_id=deployment.deploying_revision_id, + make_route( + revision_id=NEW_REV, status=status, traffic_status=traffic_status, traffic_ratio=traffic_ratio, @@ -116,256 +154,1392 @@ def _green_routes( ] -# ── Test Classes ── +# =========================================================================== +# 1. Basic FSM states +# =========================================================================== -class TestBlueGreenNoGreenRoutes: - """When no green routes exist, all should be created as INACTIVE.""" +class TestBasicFSMStates: + """Test fundamental FSM transitions.""" - def test_creates_all_green_inactive(self) -> None: - deployment = _make_deployment(desired=3) - blues = _blue_routes(deployment, 3) + def test_no_routes_initial_cycle_creates_green(self) -> None: + """First cycle with 0 routes → PROVISIONING, creates desired count.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) - result = blue_green_evaluate(deployment, blues, BlueGreenSpec()) + result = blue_green_evaluate(deployment, [], spec) assert result.sub_step == DeploymentSubStep.PROVISIONING assert not result.completed - assert len(result.route_changes.scale_out_specs) == 3 - assert not result.route_changes.scale_in_route_ids - assert not result.route_changes.promote_route_ids + assert _count_scale_out(result) == 3 + assert 
len(_scale_in_ids(result)) == 0 + assert len(_promote_ids(result)) == 0 - def test_creator_spec_has_inactive_traffic(self) -> None: - deployment = _make_deployment(desired=2) - blues = _blue_routes(deployment, 2) + def test_green_provisioning_waits(self) -> None: + """Green routes in PROVISIONING → wait (PROVISIONING sub-step).""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.PROVISIONING) - result = blue_green_evaluate(deployment, blues, BlueGreenSpec()) + result = blue_green_evaluate(deployment, routes, spec) - for creator in result.route_changes.scale_out_specs: - spec = creator.spec - assert isinstance(spec, RouteCreatorSpec) - assert spec.traffic_status == RouteTrafficStatus.INACTIVE - assert spec.traffic_ratio == 0.0 - assert spec.revision_id == deployment.deploying_revision_id + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + assert _count_scale_out(result) == 0 + assert len(_scale_in_ids(result)) == 0 + assert len(_promote_ids(result)) == 0 + + def test_completed_when_all_green_healthy_auto_promote(self) -> None: + """All green healthy + auto_promote + delay=0 → completed.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) - def test_no_blue_routes_fresh_deployment(self) -> None: - """First deployment with no existing routes.""" - deployment = _make_deployment(desired=3) + result = blue_green_evaluate(deployment, routes, spec) - result = blue_green_evaluate(deployment, [], BlueGreenSpec()) + assert result.completed + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert len(_promote_ids(result)) == 3 + assert len(_scale_in_ids(result)) == 3 - assert result.sub_step == DeploymentSubStep.PROVISIONING - assert len(result.route_changes.scale_out_specs) 
== 3 + def test_rollback_when_all_green_failed(self) -> None: + """All green routes failed → ROLLED_BACK.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.FAILED_TO_START) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + assert not result.completed + + def test_rollback_with_terminated_green_routes(self) -> None: + """Green routes in TERMINATED also count as failed.""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(2) + _green_routes(2, status=RouteStatus.TERMINATED) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + assert not result.completed + + +# =========================================================================== +# 2. auto_promote variations +# =========================================================================== + + +class TestAutoPromote: + """Test auto_promote parameter controls.""" + + def test_auto_promote_true_delay_zero_promotes(self) -> None: + """auto_promote=True, delay=0 → promote immediately.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_promote_ids(result)) == 3 + assert len(_scale_in_ids(result)) == 3 + + def test_auto_promote_false_waits_for_manual(self) -> None: + """auto_promote=False → PROGRESSING, waiting for manual promotion.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=False, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) + + result = 
blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + assert len(_promote_ids(result)) == 0 + assert len(_scale_in_ids(result)) == 0 + + def test_auto_promote_true_delay_positive_waits(self) -> None: + """auto_promote=True, delay>0 → PROGRESSING (delay wait).""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=60) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + assert len(_promote_ids(result)) == 0 + assert len(_scale_in_ids(result)) == 0 + + def test_auto_promote_false_delay_positive_still_waits(self) -> None: + """auto_promote=False, delay>0 → PROGRESSING (manual overrides delay).""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=False, promote_delay_seconds=120) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + assert len(_promote_ids(result)) == 0 + + def test_auto_promote_true_delay_1_second_waits(self) -> None: + """auto_promote=True, delay=1 → still waits (any positive delay).""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=1) + routes = _blue_routes(2) + _green_routes(2, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert not result.completed + assert len(_promote_ids(result)) == 0 + + def test_default_spec_auto_promote_false(self) -> None: + """Default BlueGreenSpec has auto_promote=False.""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec() + routes = _blue_routes(2) + _green_routes(2, status=RouteStatus.HEALTHY) + + 
result = blue_green_evaluate(deployment, routes, spec) + + assert not result.completed + assert result.sub_step == DeploymentSubStep.PROGRESSING + + +# =========================================================================== +# 3. Provisioning states +# =========================================================================== -class TestBlueGreenProvisioning: - """When green routes are still PROVISIONING, the FSM should wait.""" +class TestProvisioningStates: + """Test PROVISIONING sub-step behaviors.""" def test_all_green_provisioning(self) -> None: - deployment = _make_deployment(desired=3) - blues = _blue_routes(deployment, 3) - greens = _green_routes(deployment, 3, status=RouteStatus.PROVISIONING) + """All green routes PROVISIONING → PROVISIONING sub-step.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.PROVISIONING) - result = blue_green_evaluate(deployment, blues + greens, BlueGreenSpec()) + result = blue_green_evaluate(deployment, routes, spec) assert result.sub_step == DeploymentSubStep.PROVISIONING assert not result.completed - assert not result.route_changes.scale_out_specs - assert not result.route_changes.scale_in_route_ids - assert not result.route_changes.promote_route_ids + assert _count_scale_out(result) == 0 def test_partial_provisioning_partial_healthy(self) -> None: - deployment = _make_deployment(desired=3) - blues = _blue_routes(deployment, 3) - greens = _green_routes(deployment, 1, status=RouteStatus.HEALTHY) + _green_routes( - deployment, 2, status=RouteStatus.PROVISIONING + """Some green PROVISIONING + some HEALTHY → PROVISIONING (wait).""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = ( + _blue_routes(3) + + _green_routes(1, status=RouteStatus.HEALTHY) + + _green_routes(2, status=RouteStatus.PROVISIONING) + ) + + result = 
blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + + def test_single_provisioning_among_many_healthy(self) -> None: + """Even 1 PROVISIONING green among many healthy → PROVISIONING.""" + deployment = make_deployment(desired=5) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = ( + _blue_routes(5) + + _green_routes(4, status=RouteStatus.HEALTHY) + + _green_routes(1, status=RouteStatus.PROVISIONING) ) - result = blue_green_evaluate(deployment, blues + greens, BlueGreenSpec()) + result = blue_green_evaluate(deployment, routes, spec) assert result.sub_step == DeploymentSubStep.PROVISIONING assert not result.completed + def test_no_green_with_blue_creates_all(self) -> None: + """Blue routes exist, no green → create all desired green routes.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert _count_scale_out(result) == 3 + + def test_no_green_no_blue_creates_all(self) -> None: + """Fresh deployment with no routes → create all desired.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) -class TestBlueGreenAllGreenFailed: - """When all green routes have failed, rollback should occur.""" + result = blue_green_evaluate(deployment, [], spec) - def test_all_green_failed_rollback(self) -> None: - deployment = _make_deployment(desired=3) - blues = _blue_routes(deployment, 3) - greens = _green_routes(deployment, 3, status=RouteStatus.FAILED_TO_START) + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert _count_scale_out(result) == 3 + + +# =========================================================================== +# 4. 
Rollback scenarios +# =========================================================================== + + +class TestRollbackScenarios: + """Test rollback behavior when green routes fail.""" + + def test_all_green_failed_to_start_rollback(self) -> None: + """All green FAILED_TO_START → ROLLED_BACK with scale_in for failed routes.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(3, status=RouteStatus.FAILED_TO_START) + routes = _blue_routes(3) + greens - result = blue_green_evaluate(deployment, blues + greens, BlueGreenSpec()) + result = blue_green_evaluate(deployment, routes, spec) assert result.sub_step == DeploymentSubStep.ROLLED_BACK assert not result.completed green_ids = {r.route_id for r in greens} - assert set(result.route_changes.scale_in_route_ids) == green_ids - assert not result.route_changes.promote_route_ids + assert set(_scale_in_ids(result)) == green_ids + assert len(_promote_ids(result)) == 0 def test_all_green_terminated_rollback(self) -> None: - deployment = _make_deployment(desired=2) - blues = _blue_routes(deployment, 2) - greens = _green_routes(deployment, 2, status=RouteStatus.TERMINATED) + """All green TERMINATED → ROLLED_BACK.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(3, status=RouteStatus.TERMINATED) + routes = _blue_routes(3) + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + green_ids = {r.route_id for r in greens} + assert set(_scale_in_ids(result)) == green_ids + + def test_mixed_failed_and_terminated_green_rollback(self) -> None: + """Mixed FAILED_TO_START + TERMINATED green → ROLLED_BACK.""" + deployment = make_deployment(desired=4) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(2, status=RouteStatus.FAILED_TO_START) + _green_routes( + 
2, status=RouteStatus.TERMINATED + ) + routes = _blue_routes(4) + greens - result = blue_green_evaluate(deployment, blues + greens, BlueGreenSpec()) + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + green_ids = {r.route_id for r in greens} + assert set(_scale_in_ids(result)) == green_ids + + def test_rollback_no_blue_routes(self) -> None: + """All green failed with no blue routes → ROLLED_BACK.""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(2, status=RouteStatus.FAILED_TO_START) + + result = blue_green_evaluate(deployment, greens, spec) assert result.sub_step == DeploymentSubStep.ROLLED_BACK assert not result.completed + def test_rollback_preserves_blue_routes(self) -> None: + """On rollback, blue routes are NOT scale_in'd — only green routes.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blues = _blue_routes(3) + greens = _green_routes(3, status=RouteStatus.FAILED_TO_START) + routes = blues + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + blue_ids = {r.route_id for r in blues} + green_ids = {r.route_id for r in greens} + assert set(_scale_in_ids(result)) == green_ids + assert blue_ids.isdisjoint(set(_scale_in_ids(result))) + -class TestBlueGreenMixedGreen: - """When green routes are in mixed states (healthy + failed, no provisioning).""" +# =========================================================================== +# 5. 
Mixed green statuses (healthy + failed, no provisioning) +# =========================================================================== - def test_healthy_and_failed_mixed(self) -> None: - deployment = _make_deployment(desired=3) - blues = _blue_routes(deployment, 3) - greens = _green_routes(deployment, 1, status=RouteStatus.HEALTHY) + _green_routes( - deployment, 2, status=RouteStatus.FAILED_TO_START + +class TestMixedGreenStatuses: + """Test with green routes in various mixed states.""" + + def test_healthy_and_failed_mixed_progressing(self) -> None: + """Some green healthy, some failed (no provisioning) → PROGRESSING.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = ( + _blue_routes(3) + + _green_routes(1, status=RouteStatus.HEALTHY) + + _green_routes(2, status=RouteStatus.FAILED_TO_START) ) - result = blue_green_evaluate(deployment, blues + greens, BlueGreenSpec()) + result = blue_green_evaluate(deployment, routes, spec) assert result.sub_step == DeploymentSubStep.PROGRESSING assert not result.completed + def test_healthy_and_terminated_mixed_progressing(self) -> None: + """Some green healthy, some terminated → PROGRESSING (not enough healthy).""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = ( + _blue_routes(3) + + _green_routes(2, status=RouteStatus.HEALTHY) + + _green_routes(1, status=RouteStatus.TERMINATED) + ) -class TestBlueGreenPromotion: - """When all green routes are healthy and promotion should happen.""" + result = blue_green_evaluate(deployment, routes, spec) - def test_auto_promote_true_delay_zero_completed(self) -> None: - deployment = _make_deployment(desired=3) - blues = _blue_routes(deployment, 3) - greens = _green_routes(deployment, 3, status=RouteStatus.HEALTHY) + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + + def 
test_degraded_green_counts_as_healthy(self) -> None: + """DEGRADED green routes count as active (is_active=True).""" + deployment = make_deployment(desired=1) spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(1) + _green_routes(1, status=RouteStatus.DEGRADED) - result = blue_green_evaluate(deployment, blues + greens, spec) + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_promote_ids(result)) == 1 + + def test_unhealthy_green_counts_as_healthy(self) -> None: + """UNHEALTHY green routes count as active.""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(1) + _green_routes(1, status=RouteStatus.UNHEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_promote_ids(result)) == 1 + + def test_mix_degraded_and_healthy_green_promoted(self) -> None: + """Mix of DEGRADED and HEALTHY green → all promoted on completion.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(2, status=RouteStatus.HEALTHY) + _green_routes( + 1, status=RouteStatus.DEGRADED + ) + routes = _blue_routes(3) + greens + + result = blue_green_evaluate(deployment, routes, spec) - assert result.sub_step == DeploymentSubStep.PROGRESSING assert result.completed - # Green route IDs should be promoted green_ids = {r.route_id for r in greens} - assert set(result.route_changes.promote_route_ids) == green_ids - # Blue route IDs should be scaled in - blue_ids = {r.route_id for r in blues} - assert set(result.route_changes.scale_in_route_ids) == blue_ids - # No new routes created - assert not result.route_changes.scale_out_specs - - def test_auto_promote_false_manual_wait(self) -> None: - deployment = _make_deployment(desired=3) - blues = _blue_routes(deployment, 3) - greens = _green_routes(deployment, 3, 
status=RouteStatus.HEALTHY) - spec = BlueGreenSpec(auto_promote=False, promote_delay_seconds=0) + assert set(_promote_ids(result)) == green_ids - result = blue_green_evaluate(deployment, blues + greens, spec) + def test_mix_unhealthy_and_healthy_green_promoted(self) -> None: + """Mix of UNHEALTHY and HEALTHY green → all promoted on completion.""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(1, status=RouteStatus.HEALTHY) + _green_routes( + 1, status=RouteStatus.UNHEALTHY + ) + routes = _blue_routes(2) + greens - assert result.sub_step == DeploymentSubStep.PROGRESSING + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + green_ids = {r.route_id for r in greens} + assert set(_promote_ids(result)) == green_ids + + +# =========================================================================== +# 6. Blue route status variations +# =========================================================================== + + +class TestBlueRouteStatuses: + """Test how different blue route statuses are handled.""" + + def test_blue_terminating_not_counted_as_active(self) -> None: + """Blue routes in TERMINATING are not counted as blue_active.""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(1, status=RouteStatus.HEALTHY) + routes = [ + make_route( + revision_id=OLD_REV, + status=RouteStatus.TERMINATING, + traffic_status=RouteTrafficStatus.INACTIVE, + traffic_ratio=0.0, + ), + ] + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + # Only green promoted, no blue in scale_in (terminating is not active) + assert len(_promote_ids(result)) == 1 + assert len(_scale_in_ids(result)) == 0 + + def test_blue_terminated_not_counted(self) -> None: + """Blue routes in TERMINATED are not counted as blue_active.""" + deployment = 
make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(1, status=RouteStatus.HEALTHY) + routes = [ + make_route( + revision_id=OLD_REV, + status=RouteStatus.TERMINATED, + traffic_status=RouteTrafficStatus.INACTIVE, + traffic_ratio=0.0, + ), + ] + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_scale_in_ids(result)) == 0 + + def test_blue_failed_not_counted_as_active(self) -> None: + """Blue routes in FAILED_TO_START are not counted as blue_active.""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(1, status=RouteStatus.HEALTHY) + routes = [ + make_route( + revision_id=OLD_REV, + status=RouteStatus.FAILED_TO_START, + traffic_status=RouteTrafficStatus.INACTIVE, + traffic_ratio=0.0, + ), + ] + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_scale_in_ids(result)) == 0 + + def test_mixed_blue_statuses_only_active_scale_in(self) -> None: + """Only active blue routes are included in scale_in.""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + active_blue = make_route( + revision_id=OLD_REV, + status=RouteStatus.HEALTHY, + traffic_status=RouteTrafficStatus.ACTIVE, + traffic_ratio=1.0, + ) + inactive_blue = make_route( + revision_id=OLD_REV, + status=RouteStatus.TERMINATING, + traffic_status=RouteTrafficStatus.INACTIVE, + traffic_ratio=0.0, + ) + greens = _green_routes(2, status=RouteStatus.HEALTHY) + routes = [active_blue, inactive_blue] + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert _scale_in_ids(result) == [active_blue.route_id] + + def test_blue_degraded_counted_as_active(self) -> None: + """Blue routes in DEGRADED are counted as active → included in scale_in.""" + deployment = 
make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blue = make_route( + revision_id=OLD_REV, + status=RouteStatus.DEGRADED, + traffic_status=RouteTrafficStatus.ACTIVE, + traffic_ratio=1.0, + ) + greens = _green_routes(1, status=RouteStatus.HEALTHY) + routes = [blue] + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert _scale_in_ids(result) == [blue.route_id] + + def test_blue_unhealthy_counted_as_active(self) -> None: + """Blue routes in UNHEALTHY are counted as active → included in scale_in.""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blue = make_route( + revision_id=OLD_REV, + status=RouteStatus.UNHEALTHY, + traffic_status=RouteTrafficStatus.ACTIVE, + traffic_ratio=1.0, + ) + greens = _green_routes(1, status=RouteStatus.HEALTHY) + routes = [blue] + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert _scale_in_ids(result) == [blue.route_id] + + +# =========================================================================== +# 7. 
Multi-cycle progression +# =========================================================================== + + +class TestMultiCycleProgression: + """Simulate multiple evaluation cycles.""" + + def test_cycle_1_no_green_creates_all(self) -> None: + """Cycle 1: blue only → creates desired green routes.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert _count_scale_out(result) == 3 + + def test_cycle_2_green_provisioning_waits(self) -> None: + """Cycle 2: green PROVISIONING → wait.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.PROVISIONING) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROVISIONING assert not result.completed - assert not result.route_changes.promote_route_ids - assert not result.route_changes.scale_in_route_ids - def test_auto_promote_true_delay_positive_wait(self) -> None: - deployment = _make_deployment(desired=3) - blues = _blue_routes(deployment, 3) - greens = _green_routes(deployment, 3, status=RouteStatus.HEALTHY) - spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=60) + def test_cycle_3_partial_green_healthy_waits(self) -> None: + """Cycle 3: some green healthy, some provisioning → still PROVISIONING.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = ( + _blue_routes(3) + + _green_routes(2, status=RouteStatus.HEALTHY) + + _green_routes(1, status=RouteStatus.PROVISIONING) + ) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + + def test_cycle_4_all_green_healthy_promotes(self) -> None: + 
"""Cycle 4: all green healthy → completed with promotion.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_promote_ids(result)) == 3 + assert len(_scale_in_ids(result)) == 3 + + def test_not_completed_when_green_less_than_desired(self) -> None: + """Green healthy < desired → PROGRESSING (not enough).""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(2, status=RouteStatus.HEALTHY) - result = blue_green_evaluate(deployment, blues + greens, spec) + result = blue_green_evaluate(deployment, routes, spec) assert result.sub_step == DeploymentSubStep.PROGRESSING assert not result.completed - assert not result.route_changes.promote_route_ids - assert not result.route_changes.scale_in_route_ids -class TestBlueGreenSingleReplica: - """Edge case: desired=1 single replica.""" +# =========================================================================== +# 8. 
Promotion route ID verification +# =========================================================================== - def test_single_replica_no_green(self) -> None: - deployment = _make_deployment(desired=1) - blues = _blue_routes(deployment, 1) - result = blue_green_evaluate(deployment, blues, BlueGreenSpec()) +class TestPromotionRouteIdVerification: + """Verify promote and scale_in route IDs are exact matches.""" - assert result.sub_step == DeploymentSubStep.PROVISIONING - assert len(result.route_changes.scale_out_specs) == 1 + def test_promote_ids_match_green_healthy(self) -> None: + """Promoted route IDs must exactly match green healthy route IDs.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blues = _blue_routes(3) + greens = _green_routes(3, status=RouteStatus.HEALTHY) + routes = blues + greens + + result = blue_green_evaluate(deployment, routes, spec) + + expected_promote = [r.route_id for r in greens] + assert _promote_ids(result) == expected_promote - def test_single_replica_promotion(self) -> None: - deployment = _make_deployment(desired=1) - blues = _blue_routes(deployment, 1) - greens = _green_routes(deployment, 1, status=RouteStatus.HEALTHY) + def test_scale_in_ids_match_blue_active(self) -> None: + """Scale-in route IDs must exactly match blue active route IDs.""" + deployment = make_deployment(desired=3) spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blues = _blue_routes(3) + greens = _green_routes(3, status=RouteStatus.HEALTHY) + routes = blues + greens + + result = blue_green_evaluate(deployment, routes, spec) + + expected_scale_in = [r.route_id for r in blues] + assert _scale_in_ids(result) == expected_scale_in + + def test_no_cross_contamination_between_promote_and_scale_in(self) -> None: + """Promote IDs and scale_in IDs must be disjoint sets.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blues = 
_blue_routes(3) + greens = _green_routes(3, status=RouteStatus.HEALTHY) + routes = blues + greens + + result = blue_green_evaluate(deployment, routes, spec) - result = blue_green_evaluate(deployment, blues + greens, spec) + promote_set = set(_promote_ids(result)) + scale_in_set = set(_scale_in_ids(result)) + assert promote_set.isdisjoint(scale_in_set) + + def test_promote_ids_order_matches_green_order(self) -> None: + """Promote IDs order should match the order green routes were processed.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(3, status=RouteStatus.HEALTHY) + routes = _blue_routes(3) + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert _promote_ids(result) == [r.route_id for r in greens] + + +# =========================================================================== +# 9. Route creator specs validation +# =========================================================================== + + +class TestRouteCreatorSpecs: + """Validate that route creator specs have correct fields.""" + + def test_creator_specs_use_deploying_revision(self) -> None: + """Created routes should use the deploying revision.""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, [], spec) + + assert _count_scale_out(result) == 1 + creator_spec = result.route_changes.scale_out_specs[0].spec + assert isinstance(creator_spec, RouteCreatorSpec) + assert creator_spec.revision_id == NEW_REV + assert creator_spec.endpoint_id == ENDPOINT_ID + assert creator_spec.session_owner_id == USER_ID + assert creator_spec.domain == "default" + assert creator_spec.project_id == PROJECT_ID + + def test_creator_specs_have_inactive_traffic(self) -> None: + """Green routes must be created with INACTIVE traffic status.""" + deployment = make_deployment(desired=2) + spec = 
BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(2) + + result = blue_green_evaluate(deployment, routes, spec) + + for creator in result.route_changes.scale_out_specs: + creator_spec = creator.spec + assert isinstance(creator_spec, RouteCreatorSpec) + assert creator_spec.traffic_status == RouteTrafficStatus.INACTIVE + assert creator_spec.traffic_ratio == 0.0 + + def test_multiple_creators_all_correct(self) -> None: + """Multiple creators all have correct metadata.""" + deployment = make_deployment(desired=5) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, [], spec) + + assert _count_scale_out(result) == 5 + for creator in result.route_changes.scale_out_specs: + creator_spec = creator.spec + assert isinstance(creator_spec, RouteCreatorSpec) + assert creator_spec.revision_id == NEW_REV + assert creator_spec.endpoint_id == ENDPOINT_ID + assert creator_spec.traffic_status == RouteTrafficStatus.INACTIVE + assert creator_spec.traffic_ratio == 0.0 + + def test_creator_specs_different_route_ids(self) -> None: + """Each creator should produce a unique route (verified by spec fields).""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, [], spec) + + assert _count_scale_out(result) == 3 + # All creators should have the same deploying revision but be separate instances + for creator in result.route_changes.scale_out_specs: + assert isinstance(creator.spec, RouteCreatorSpec) + assert creator.spec.revision_id == NEW_REV + + +# =========================================================================== +# 10. 
Edge cases +# =========================================================================== + + +class TestEdgeCases: + """Edge cases and boundary conditions.""" + + def test_desired_1_single_replica_full_lifecycle(self) -> None: + """desired=1 → create 1 green, promote 1 green, terminate 1 blue.""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + # Cycle 1: no green → create 1 + routes = _blue_routes(1) + r1 = blue_green_evaluate(deployment, routes, spec) + assert _count_scale_out(r1) == 1 + + # Cycle 2: green healthy → promote + routes = _blue_routes(1) + _green_routes(1, status=RouteStatus.HEALTHY) + r2 = blue_green_evaluate(deployment, routes, spec) + assert r2.completed + assert len(_promote_ids(r2)) == 1 + assert len(_scale_in_ids(r2)) == 1 + + def test_desired_0_no_routes_no_creation(self) -> None: + """desired=0, no routes → PROVISIONING with 0 green created.""" + deployment = make_deployment(desired=0) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, [], spec) + + # No green needed, so completion with 0 green + assert _count_scale_out(result) == 0 + + def test_more_green_healthy_than_desired(self) -> None: + """green_healthy > desired → still promotes (completes).""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(3, status=RouteStatus.HEALTHY) + routes = _blue_routes(2) + greens + + result = blue_green_evaluate(deployment, routes, spec) assert result.completed - assert len(result.route_changes.promote_route_ids) == 1 - assert len(result.route_changes.scale_in_route_ids) == 1 + green_ids = {r.route_id for r in greens} + assert set(_promote_ids(result)) == green_ids + def test_only_failed_green_no_blue_rolls_back(self) -> None: + """Only failed green routes, no blue → ROLLED_BACK.""" + deployment = make_deployment(desired=2) + spec = 
BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(2, status=RouteStatus.FAILED_TO_START) -class TestBlueGreenManyReplicas: - """Edge case: desired=5 many replicas.""" + result = blue_green_evaluate(deployment, greens, spec) - def test_many_replicas_creates_all(self) -> None: - deployment = _make_deployment(desired=5) - blues = _blue_routes(deployment, 5) + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + + def test_deploying_rev_none_all_routes_classified_as_blue(self) -> None: + """If deploying_revision_id is None, all routes classified as blue.""" + deployment = make_deployment(desired=1, deploying_revision_id=None) # type: ignore[arg-type] + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)] - result = blue_green_evaluate(deployment, blues, BlueGreenSpec()) + result = blue_green_evaluate(deployment, routes, spec) + # All classified as blue (not green), no green → PROVISIONING with create assert result.sub_step == DeploymentSubStep.PROVISIONING - assert len(result.route_changes.scale_out_specs) == 5 + assert _count_scale_out(result) == 1 - def test_many_replicas_promotion(self) -> None: - deployment = _make_deployment(desired=5) - blues = _blue_routes(deployment, 5) - greens = _green_routes(deployment, 5, status=RouteStatus.HEALTHY) + def test_route_without_revision_classified_as_blue(self) -> None: + """Routes with revision_id=None are classified as blue (non-green).""" + deployment = make_deployment(desired=1) spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = [make_route(revision_id=None, status=RouteStatus.HEALTHY)] # type: ignore[arg-type] - result = blue_green_evaluate(deployment, blues + greens, spec) + result = blue_green_evaluate(deployment, routes, spec) + + # revision_id=None != NEW_REV, so classified as blue + assert _count_scale_out(result) == 1 + + def 
test_provisioning_prioritized_over_promotion(self) -> None: + """PROVISIONING check comes before promotion check.""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = ( + _blue_routes(1) + + _green_routes(1, status=RouteStatus.HEALTHY) + + _green_routes(1, status=RouteStatus.PROVISIONING) + ) + + result = blue_green_evaluate(deployment, routes, spec) + + # Even though green_healthy >= desired, PROVISIONING takes precedence + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + + def test_large_desired_creates_all(self) -> None: + """Large desired (10) creates all green at once.""" + deployment = make_deployment(desired=10) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(10) + + result = blue_green_evaluate(deployment, routes, spec) + + assert _count_scale_out(result) == 10 + + def test_large_desired_promotes_all(self) -> None: + """Large desired (10) promotes all green at once.""" + deployment = make_deployment(desired=10) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blues = _blue_routes(10) + greens = _green_routes(10, status=RouteStatus.HEALTHY) + routes = blues + greens + + result = blue_green_evaluate(deployment, routes, spec) assert result.completed - assert len(result.route_changes.promote_route_ids) == 5 - assert len(result.route_changes.scale_in_route_ids) == 5 + assert len(_promote_ids(result)) == 10 + assert len(_scale_in_ids(result)) == 10 -class TestBlueGreenNoBlueRoutes: +# =========================================================================== +# 11. 
Realistic multi-step scenario (desired=5) +# =========================================================================== + + +class TestRealisticScenario: + """Simulate a realistic blue-green deployment with desired=5.""" + + def test_step_by_step_blue_green_deployment(self) -> None: + """Full simulation of a blue-green deployment across multiple cycles.""" + deployment = make_deployment(desired=5) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + # Cycle 1: 5 blue, 0 green → create all 5 green (INACTIVE) + blues = _blue_routes(5) + r1 = blue_green_evaluate(deployment, blues, spec) + + assert r1.sub_step == DeploymentSubStep.PROVISIONING + assert _count_scale_out(r1) == 5 + assert len(_scale_in_ids(r1)) == 0 + + # Cycle 2: 5 blue, 5 green PROVISIONING → wait + routes_c2 = blues + _green_routes(5, status=RouteStatus.PROVISIONING) + r2 = blue_green_evaluate(deployment, routes_c2, spec) + + assert r2.sub_step == DeploymentSubStep.PROVISIONING + assert _count_scale_out(r2) == 0 + + # Cycle 3: 5 blue, 3 healthy + 2 provisioning → still PROVISIONING + routes_c3 = ( + blues + + _green_routes(3, status=RouteStatus.HEALTHY) + + _green_routes(2, status=RouteStatus.PROVISIONING) + ) + r3 = blue_green_evaluate(deployment, routes_c3, spec) + + assert r3.sub_step == DeploymentSubStep.PROVISIONING + + # Cycle 4: 5 blue, 4 healthy + 1 provisioning → still PROVISIONING + routes_c4 = ( + blues + + _green_routes(4, status=RouteStatus.HEALTHY) + + _green_routes(1, status=RouteStatus.PROVISIONING) + ) + r4 = blue_green_evaluate(deployment, routes_c4, spec) + + assert r4.sub_step == DeploymentSubStep.PROVISIONING + + # Cycle 5: 5 blue, 5 green healthy → completed (atomic promotion) + greens = _green_routes(5, status=RouteStatus.HEALTHY) + routes_c5 = blues + greens + r5 = blue_green_evaluate(deployment, routes_c5, spec) + + assert r5.completed + assert len(_promote_ids(r5)) == 5 + assert len(_scale_in_ids(r5)) == 5 + + def 
test_step_by_step_with_failure_rollback(self) -> None: + """Simulation of a blue-green deployment that fails and rolls back.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + # Cycle 1: 3 blue, 0 green → create 3 green + blues = _blue_routes(3) + r1 = blue_green_evaluate(deployment, blues, spec) + assert _count_scale_out(r1) == 3 + + # Cycle 2: 3 blue, 3 green PROVISIONING → wait + routes_c2 = blues + _green_routes(3, status=RouteStatus.PROVISIONING) + r2 = blue_green_evaluate(deployment, routes_c2, spec) + assert r2.sub_step == DeploymentSubStep.PROVISIONING + + # Cycle 3: all green fail → ROLLED_BACK + greens_failed = _green_routes(3, status=RouteStatus.FAILED_TO_START) + routes_c3 = blues + greens_failed + r3 = blue_green_evaluate(deployment, routes_c3, spec) + + assert r3.sub_step == DeploymentSubStep.ROLLED_BACK + assert not r3.completed + green_ids = {r.route_id for r in greens_failed} + assert set(_scale_in_ids(r3)) == green_ids + + def test_step_by_step_manual_promotion(self) -> None: + """Simulation with auto_promote=False (manual promotion flow).""" + deployment = make_deployment(desired=3) + + # Cycle 1: auto_promote=False, create green + spec_manual = BlueGreenSpec(auto_promote=False, promote_delay_seconds=0) + blues = _blue_routes(3) + r1 = blue_green_evaluate(deployment, blues, spec_manual) + assert _count_scale_out(r1) == 3 + + # Cycle 2: all green healthy, but auto_promote=False → PROGRESSING (wait) + routes_c2 = blues + _green_routes(3, status=RouteStatus.HEALTHY) + r2 = blue_green_evaluate(deployment, routes_c2, spec_manual) + assert r2.sub_step == DeploymentSubStep.PROGRESSING + assert not r2.completed + assert len(_promote_ids(r2)) == 0 + + # Cycle 3: admin switches to auto_promote=True → completed + spec_auto = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + r3 = blue_green_evaluate(deployment, routes_c2, spec_auto) + assert r3.completed + assert len(_promote_ids(r3)) 
== 3 + assert len(_scale_in_ids(r3)) == 3 + + +# =========================================================================== +# 12. desired_replica_count vs replica_count +# =========================================================================== + + +class TestDesiredReplicaCount: + """Test that the correct desired count is used.""" + + def test_desired_replica_count_overrides_replica_count(self) -> None: + """When desired_replica_count is set, it takes precedence.""" + deployment = make_deployment(desired=3) + deployment.replica_spec = ReplicaSpec( + replica_count=1, + desired_replica_count=3, + ) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, [], spec) + + # desired is 3 (from desired_replica_count), not 1 + assert _count_scale_out(result) == 3 + + def test_replica_count_used_when_no_desired(self) -> None: + """When desired_replica_count is None, uses replica_count.""" + deployment = make_deployment(desired=2) + deployment.replica_spec = ReplicaSpec( + replica_count=2, + desired_replica_count=None, + ) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _green_routes(2, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + + def test_desired_replica_count_determines_green_creation_count(self) -> None: + """desired_replica_count controls how many green routes are created.""" + deployment = make_deployment(desired=5) + deployment.replica_spec = ReplicaSpec( + replica_count=2, + desired_replica_count=5, + ) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(2) + + result = blue_green_evaluate(deployment, routes, spec) + + assert _count_scale_out(result) == 5 + + +# =========================================================================== +# 13. 
Scale-down during blue-green deployment +# =========================================================================== + + +class TestScaleDownDuringBlueGreen: + """Test behavior when desired is reduced during blue-green deployment.""" + + def test_desired_reduced_fewer_green_needed(self) -> None: + """If desired is lowered during deployment, fewer green are healthy enough.""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + # 3 blue (original desired was 3), now desired=2 + routes = _blue_routes(3) + _green_routes(2, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + # green_healthy=2 >= desired=2 → completed + assert result.completed + assert len(_promote_ids(result)) == 2 + assert len(_scale_in_ids(result)) == 3 # all 3 blue routes terminated + + def test_desired_increased_needs_more_green(self) -> None: + """If desired is raised, green_healthy < new_desired → PROGRESSING.""" + deployment = make_deployment(desired=5) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + # green_healthy=3 < desired=5 → PROGRESSING + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + + +# =========================================================================== +# 14. 
No blue routes (fresh deployment) +# =========================================================================== + + +class TestNoBlueRoutes: """When there are no blue routes (fresh deployment).""" + def test_fresh_deployment_creates_green(self) -> None: + """No blue, no green → create all desired green.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, [], spec) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert _count_scale_out(result) == 3 + def test_promotion_no_blue(self) -> None: - """Promotion with no blue routes to terminate.""" - deployment = _make_deployment(desired=3) - greens = _green_routes(deployment, 3, status=RouteStatus.HEALTHY) + """Promotion with no blue routes → complete with 0 scale_in.""" + deployment = make_deployment(desired=3) spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(3, status=RouteStatus.HEALTHY) result = blue_green_evaluate(deployment, greens, spec) assert result.completed green_ids = {r.route_id for r in greens} - assert set(result.route_changes.promote_route_ids) == green_ids - assert not result.route_changes.scale_in_route_ids + assert set(_promote_ids(result)) == green_ids + assert len(_scale_in_ids(result)) == 0 + def test_fresh_deployment_all_fail_rollback(self) -> None: + """Fresh deployment where all green routes fail.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + greens = _green_routes(3, status=RouteStatus.FAILED_TO_START) -class TestBlueGreenPromotionRouteIdVerification: - """Verify promote and scale_in route IDs are exact matches.""" + result = blue_green_evaluate(deployment, greens, spec) - def test_promote_ids_match_green_healthy(self) -> None: - deployment = _make_deployment(desired=3) - blues = _blue_routes(deployment, 3) - greens = _green_routes(deployment, 3, 
status=RouteStatus.HEALTHY) + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + + +# =========================================================================== +# 15. Concurrent provisioning checks +# =========================================================================== + + +class TestConcurrentProvisioningChecks: + """Test that provisioning blocks further changes correctly.""" + + def test_provisioning_blocks_promotion(self) -> None: + """Any green route in PROVISIONING → wait, even if enough healthy for promotion.""" + deployment = make_deployment(desired=2) spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = ( + _blue_routes(2) + + _green_routes(2, status=RouteStatus.HEALTHY) + + _green_routes(1, status=RouteStatus.PROVISIONING) + ) - result = blue_green_evaluate(deployment, blues + greens, spec) + result = blue_green_evaluate(deployment, routes, spec) - expected_promote = [r.route_id for r in greens] - assert result.route_changes.promote_route_ids == expected_promote + # PROVISIONING takes priority over promotion + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + assert len(_promote_ids(result)) == 0 + assert len(_scale_in_ids(result)) == 0 - def test_scale_in_ids_match_blue_active(self) -> None: - deployment = _make_deployment(desired=3) - blues = _blue_routes(deployment, 3) - greens = _green_routes(deployment, 3, status=RouteStatus.HEALTHY) + def test_multiple_provisioning_routes_still_waits(self) -> None: + """Multiple PROVISIONING routes → still PROVISIONING.""" + deployment = make_deployment(desired=3) spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _green_routes(3, status=RouteStatus.PROVISIONING) - result = blue_green_evaluate(deployment, blues + greens, spec) + result = blue_green_evaluate(deployment, routes, spec) - expected_scale_in = [r.route_id for r in blues] - assert result.route_changes.scale_in_route_ids == expected_scale_in + assert 
result.sub_step == DeploymentSubStep.PROVISIONING + + def test_provisioning_blocks_even_with_auto_promote_false(self) -> None: + """PROVISIONING still blocks with auto_promote=False.""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=False, promote_delay_seconds=0) + routes = ( + _blue_routes(2) + + _green_routes(1, status=RouteStatus.HEALTHY) + + _green_routes(1, status=RouteStatus.PROVISIONING) + ) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + + def test_no_actions_during_provisioning_wait(self) -> None: + """During PROVISIONING wait, no route changes should be emitted.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.PROVISIONING) + + result = blue_green_evaluate(deployment, routes, spec) + + assert _count_scale_out(result) == 0 + assert len(_scale_in_ids(result)) == 0 + assert len(_promote_ids(result)) == 0 + + +# =========================================================================== +# 16. 
Atomicity of promotion +# =========================================================================== + + +class TestAtomicPromotion: + """Test that promotion is atomic (all green promoted + all blue terminated at once).""" + + def test_promotion_is_all_or_nothing(self) -> None: + """On promotion, ALL healthy green are promoted and ALL active blue are terminated.""" + deployment = make_deployment(desired=5) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blues = _blue_routes(5) + greens = _green_routes(5, status=RouteStatus.HEALTHY) + routes = blues + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_promote_ids(result)) == 5 + assert len(_scale_in_ids(result)) == 5 + assert _count_scale_out(result) == 0 + + def test_no_partial_promotion(self) -> None: + """With green < desired, no partial promotion happens.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(2, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + # Not enough green healthy → no promotion + assert not result.completed + assert len(_promote_ids(result)) == 0 + assert len(_scale_in_ids(result)) == 0 + + def test_promotion_with_asymmetric_blue_green_count(self) -> None: + """Blue=3, Green=5 (desired=5) → all green promoted, all blue terminated.""" + deployment = make_deployment(desired=5) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blues = _blue_routes(3) + greens = _green_routes(5, status=RouteStatus.HEALTHY) + routes = blues + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_promote_ids(result)) == 5 + assert len(_scale_in_ids(result)) == 3 + + def test_promotion_with_more_blue_than_green(self) -> None: + """Blue=5, Green=3 (desired=3) → all green promoted, all blue terminated.""" + deployment = 
make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blues = _blue_routes(5) + greens = _green_routes(3, status=RouteStatus.HEALTHY) + routes = blues + greens + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_promote_ids(result)) == 3 + assert len(_scale_in_ids(result)) == 5 + + +# =========================================================================== +# 17. Idempotency and repeated evaluations +# =========================================================================== + + +class TestIdempotency: + """Test that repeated evaluations with the same state produce the same result.""" + + def test_repeated_provisioning_evaluation(self) -> None: + """Same PROVISIONING state evaluated twice → same result.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.PROVISIONING) + + r1 = blue_green_evaluate(deployment, routes, spec) + r2 = blue_green_evaluate(deployment, routes, spec) + + assert r1.sub_step == r2.sub_step == DeploymentSubStep.PROVISIONING + assert r1.completed == r2.completed is False + + def test_repeated_completion_evaluation(self) -> None: + """Same completion state evaluated twice → same result.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + blues = _blue_routes(3) + greens = _green_routes(3, status=RouteStatus.HEALTHY) + routes = blues + greens + + r1 = blue_green_evaluate(deployment, routes, spec) + r2 = blue_green_evaluate(deployment, routes, spec) + + assert r1.completed == r2.completed is True + assert len(_promote_ids(r1)) == len(_promote_ids(r2)) == 3 + + def test_repeated_rollback_evaluation(self) -> None: + """Same rollback state evaluated twice → same result.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, 
promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.FAILED_TO_START) + + r1 = blue_green_evaluate(deployment, routes, spec) + r2 = blue_green_evaluate(deployment, routes, spec) + + assert r1.sub_step == r2.sub_step == DeploymentSubStep.ROLLED_BACK + + +# =========================================================================== +# 18. Spec parameter boundary values +# =========================================================================== + + +class TestSpecBoundaryValues: + """Test boundary values for BlueGreenSpec parameters.""" + + def test_promote_delay_zero_promotes(self) -> None: + """promote_delay_seconds=0 → immediate promotion.""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(1) + _green_routes(1, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + + def test_promote_delay_large_waits(self) -> None: + """promote_delay_seconds=3600 (1 hour) → PROGRESSING (delay wait).""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=3600) + routes = _blue_routes(1) + _green_routes(1, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert not result.completed + assert result.sub_step == DeploymentSubStep.PROGRESSING + + def test_promote_delay_max_int_waits(self) -> None: + """Very large delay → PROGRESSING (delay wait).""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=999999) + routes = _blue_routes(1) + _green_routes(1, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert not result.completed + + def test_promote_delay_irrelevant_when_not_auto(self) -> None: + """When auto_promote=False, promote_delay_seconds is ignored.""" + deployment = make_deployment(desired=1) + spec = 
BlueGreenSpec(auto_promote=False, promote_delay_seconds=0) + routes = _blue_routes(1) + _green_routes(1, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + # auto_promote=False → manual wait, delay doesn't matter + assert not result.completed + assert result.sub_step == DeploymentSubStep.PROGRESSING + + +# =========================================================================== +# 19. Green route healthy count vs desired +# =========================================================================== + + +class TestGreenHealthyVsDesired: + """Test how green healthy count interacts with desired.""" + + def test_green_healthy_exactly_desired_promotes(self) -> None: + """green_healthy == desired → promotes.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + + def test_green_healthy_one_less_than_desired_waits(self) -> None: + """green_healthy == desired - 1 → PROGRESSING.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) + _green_routes(2, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + + def test_green_healthy_more_than_desired_promotes(self) -> None: + """green_healthy > desired → still promotes.""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(2) + _green_routes(4, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_promote_ids(result)) == 4 + + def test_green_healthy_zero_desired_nonzero_waits(self) -> None: + """0 healthy 
green, desired > 0 → PROVISIONING (create green).""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + routes = _blue_routes(3) # no green at all + + result = blue_green_evaluate(deployment, routes, spec) + + # No green live → PROVISIONING (create green) + assert result.sub_step == DeploymentSubStep.PROVISIONING + + +# =========================================================================== +# 20. Multiple deployments with different endpoint IDs +# =========================================================================== + + +class TestDifferentEndpointIds: + """Test that the FSM correctly handles different endpoint IDs.""" + + def test_different_endpoint_does_not_interfere(self) -> None: + """Routes from different endpoints are processed independently.""" + ep1 = UUID("11111111-0000-0000-0000-000000000001") + + deployment = make_deployment(desired=2, endpoint_id=ep1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + + # Routes for ep1 + routes = [ + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY, endpoint_id=ep1), + make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY, endpoint_id=ep1), + ] + + result = blue_green_evaluate(deployment, routes, spec) + + # Only ep1 routes → no green, create 2 + assert _count_scale_out(result) == 2 + + def test_routes_for_other_endpoint_in_list(self) -> None: + """Routes for other endpoints are treated as blue routes (different revision).""" + deployment = make_deployment(desired=1) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=0) + green = _green_routes(1, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, green, spec) + + assert result.completed + assert len(_promote_ids(result)) == 1 From 0df575d8e9bd6a8632d2f4c118072a65673b9c99 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 04:12:14 +0000 Subject: [PATCH 6/8] feat: add status_updated_at support and promote_delay_seconds to blue-green
strategy Add status_updated_at field to RouteInfo and RoutingRow for tracking when route status last changed. Implement promote_delay_seconds time calculation in blue_green_evaluate() using _latest_status_updated_at helper. Add fetch_routes_by_endpoint_ids to repository for fetching all routes including failed/terminated (needed for rollback detection). Update tests with status_updated_at parameter and add promote delay test scenarios. Co-Authored-By: Claude Opus 4.6 --- .../backend/manager/data/deployment/types.py | 1 + src/ai/backend/manager/models/routing/row.py | 7 ++ .../repositories/deployment/creators/route.py | 2 + .../deployment/db_source/db_source.py | 26 +++++++ .../repositories/deployment/repository.py | 8 ++ .../sokovan/deployment/strategy/blue_green.py | 31 ++++++-- .../sokovan/deployment/strategy/evaluator.py | 2 +- .../deployment/strategy/test_blue_green.py | 76 +++++++++++++++++-- 8 files changed, 140 insertions(+), 13 deletions(-) diff --git a/src/ai/backend/manager/data/deployment/types.py b/src/ai/backend/manager/data/deployment/types.py index ccee6a29ed1..28a3ff8e050 100644 --- a/src/ai/backend/manager/data/deployment/types.py +++ b/src/ai/backend/manager/data/deployment/types.py @@ -405,6 +405,7 @@ class RouteInfo: created_at: datetime | None revision_id: UUID | None traffic_status: RouteTrafficStatus + status_updated_at: datetime | None = None error_data: dict[str, Any] = field(default_factory=dict) diff --git a/src/ai/backend/manager/models/routing/row.py b/src/ai/backend/manager/models/routing/row.py index 51a9d9c1f9f..37ac464c8ce 100644 --- a/src/ai/backend/manager/models/routing/row.py +++ b/src/ai/backend/manager/models/routing/row.py @@ -95,6 +95,12 @@ class RoutingRow(Base): # type: ignore[misc] # Revision reference without FK (relationship only) revision: Mapped[uuid.UUID | None] = mapped_column("revision", GUID, nullable=True) + status_updated_at: Mapped[datetime | None] = mapped_column( + "status_updated_at", + 
sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=True, + ) traffic_status: Mapped[RouteTrafficStatus] = mapped_column( "traffic_status", EnumValueType(RouteTrafficStatus), @@ -255,5 +261,6 @@ def to_route_info(self) -> RouteInfo: created_at=self.created_at, revision_id=self.revision, traffic_status=self.traffic_status, + status_updated_at=self.status_updated_at, error_data=self.error_data or {}, ) diff --git a/src/ai/backend/manager/repositories/deployment/creators/route.py b/src/ai/backend/manager/repositories/deployment/creators/route.py index 2b313d7c172..254b6c087f4 100644 --- a/src/ai/backend/manager/repositories/deployment/creators/route.py +++ b/src/ai/backend/manager/repositories/deployment/creators/route.py @@ -4,6 +4,7 @@ import uuid from dataclasses import dataclass +from datetime import UTC, datetime from typing import Any, override from ai.backend.manager.data.deployment.types import RouteStatus, RouteTrafficStatus @@ -66,6 +67,7 @@ def build_values(self) -> dict[str, Any]: values: dict[str, Any] = {} if self.status is not None: values["status"] = self.status + values["status_updated_at"] = datetime.now(UTC) if self.traffic_ratio is not None: values["traffic_ratio"] = self.traffic_ratio if self.traffic_status is not None: diff --git a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py index ecebecdff56..442b901e5ef 100644 --- a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py +++ b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py @@ -1406,6 +1406,32 @@ async def fetch_active_routes_by_endpoint_ids( routes_by_endpoint[row.endpoint].append(row.to_route_info()) return routes_by_endpoint + async def fetch_routes_by_endpoint_ids( + self, + endpoint_ids: set[uuid.UUID], + ) -> Mapping[uuid.UUID, list[RouteInfo]]: + """Fetch all routes for given endpoint IDs (no status filter). 
+ + Unlike fetch_active_routes_by_endpoint_ids, this includes routes + in all statuses (FAILED_TO_START, TERMINATED, etc.), which is + required for blue-green rollback detection. + """ + if not endpoint_ids: + return {} + + async with self._begin_readonly_session_read_committed() as db_sess: + query = sa.select(RoutingRow).where( + RoutingRow.endpoint.in_(endpoint_ids), + ) + result = await db_sess.execute(query) + rows: Sequence[RoutingRow] = result.scalars().all() + routes_by_endpoint: defaultdict[uuid.UUID, list[RouteInfo]] = defaultdict(list) + for row in rows: + if row.endpoint not in routes_by_endpoint: + routes_by_endpoint[row.endpoint] = [] + routes_by_endpoint[row.endpoint].append(row.to_route_info()) + return routes_by_endpoint + async def scale_routes( self, scale_out_creators: Sequence[Creator[RoutingRow]], diff --git a/src/ai/backend/manager/repositories/deployment/repository.py b/src/ai/backend/manager/repositories/deployment/repository.py index 46014a2c330..89b0aebfd10 100644 --- a/src/ai/backend/manager/repositories/deployment/repository.py +++ b/src/ai/backend/manager/repositories/deployment/repository.py @@ -548,6 +548,14 @@ async def fetch_active_routes_by_endpoint_ids( """Fetch routes for multiple endpoints.""" return await self._db_source.fetch_active_routes_by_endpoint_ids(endpoint_ids) + @deployment_repository_resilience.apply() + async def fetch_routes_by_endpoint_ids( + self, + endpoint_ids: set[uuid.UUID], + ) -> Mapping[uuid.UUID, list[RouteInfo]]: + """Fetch all routes for multiple endpoints (no status filter).""" + return await self._db_source.fetch_routes_by_endpoint_ids(endpoint_ids) + @deployment_repository_resilience.apply() async def scale_routes( self, diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py index 0791f881b4a..c690ce386bd 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py +++ 
b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py @@ -8,6 +8,7 @@ import logging from collections.abc import Sequence +from datetime import UTC, datetime from ai.backend.logging import BraceStyleAdapter from ai.backend.manager.data.deployment.types import ( @@ -130,14 +131,24 @@ def blue_green_evaluate( ) return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) - # ── 7. auto_promote=True + delay>0 → PROGRESSING (delay wait) ── + # ── 7. auto_promote=True + delay>0 → check elapsed time ── if spec.promote_delay_seconds > 0: - log.debug( - "deployment {}: all green healthy, waiting for promote delay ({}s)", - deployment.id, - spec.promote_delay_seconds, - ) - return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) + latest_healthy_at = _latest_status_updated_at(green_healthy) + if latest_healthy_at is None: + log.debug( + "deployment {}: all green healthy but status_updated_at unknown — waiting", + deployment.id, + ) + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) + elapsed = (datetime.now(UTC) - latest_healthy_at).total_seconds() + if elapsed < spec.promote_delay_seconds: + log.debug( + "deployment {}: promote delay {:.0f}/{} seconds elapsed — waiting", + deployment.id, + elapsed, + spec.promote_delay_seconds, + ) + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) # ── 8. 
Promotion: green → ACTIVE, blue → TERMINATING ── log.info( @@ -157,6 +168,12 @@ def blue_green_evaluate( ) +def _latest_status_updated_at(routes: list[RouteInfo]) -> datetime | None: + """Return the most recent status_updated_at among the given routes.""" + timestamps = [r.status_updated_at for r in routes if r.status_updated_at is not None] + return max(timestamps) if timestamps else None + + def _build_route_creators( deployment: DeploymentInfo, count: int, diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index 79c7000c033..88d96e208d3 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -69,7 +69,7 @@ async def evaluate( ) ) policy_map = {p.endpoint: p for p in policy_search.items} - route_map = await self._deployment_repo.fetch_active_routes_by_endpoint_ids(endpoint_ids) + route_map = await self._deployment_repo.fetch_routes_by_endpoint_ids(endpoint_ids) # ── 2. 
Per-deployment evaluation ── for deployment in deployments: diff --git a/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py b/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py index da7bec443a3..3402aff3b22 100644 --- a/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py +++ b/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py @@ -15,7 +15,7 @@ from __future__ import annotations -from datetime import UTC, datetime +from datetime import UTC, datetime, timedelta from uuid import UUID, uuid4 from ai.backend.common.data.endpoint.types import EndpointLifecycle @@ -84,6 +84,7 @@ def make_route( route_id: UUID | None = None, traffic_status: RouteTrafficStatus | None = None, traffic_ratio: float | None = None, + status_updated_at: datetime | None = None, ) -> RouteInfo: if traffic_status is None: traffic_status = ( @@ -100,6 +101,7 @@ def make_route( created_at=datetime.now(UTC), revision_id=revision_id, traffic_status=traffic_status, + status_updated_at=status_updated_at, ) @@ -142,6 +144,7 @@ def _green_routes( status: RouteStatus = RouteStatus.HEALTHY, traffic_status: RouteTrafficStatus = RouteTrafficStatus.INACTIVE, traffic_ratio: float = 0.0, + status_updated_at: datetime | None = None, ) -> list[RouteInfo]: return [ make_route( @@ -149,6 +152,7 @@ def _green_routes( status=status, traffic_status=traffic_status, traffic_ratio=traffic_ratio, + status_updated_at=status_updated_at, ) for _ in range(count) ] @@ -259,10 +263,13 @@ def test_auto_promote_false_waits_for_manual(self) -> None: assert len(_scale_in_ids(result)) == 0 def test_auto_promote_true_delay_positive_waits(self) -> None: - """auto_promote=True, delay>0 → PROGRESSING (delay wait).""" + """auto_promote=True, delay>0 + recently healthy → PROGRESSING (delay wait).""" deployment = make_deployment(desired=3) spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=60) - routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) + 
recently_healthy = datetime.now(UTC) - timedelta(seconds=10) + routes = _blue_routes(3) + _green_routes( + 3, status=RouteStatus.HEALTHY, status_updated_at=recently_healthy + ) result = blue_green_evaluate(deployment, routes, spec) @@ -284,16 +291,75 @@ def test_auto_promote_false_delay_positive_still_waits(self) -> None: assert len(_promote_ids(result)) == 0 def test_auto_promote_true_delay_1_second_waits(self) -> None: - """auto_promote=True, delay=1 → still waits (any positive delay).""" + """auto_promote=True, delay=1 + just became healthy → still waits.""" deployment = make_deployment(desired=2) spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=1) - routes = _blue_routes(2) + _green_routes(2, status=RouteStatus.HEALTHY) + just_now = datetime.now(UTC) + routes = _blue_routes(2) + _green_routes( + 2, status=RouteStatus.HEALTHY, status_updated_at=just_now + ) result = blue_green_evaluate(deployment, routes, spec) assert not result.completed assert len(_promote_ids(result)) == 0 + def test_auto_promote_true_delay_elapsed_promotes(self) -> None: + """auto_promote=True, delay>0 + delay fully elapsed → completed.""" + deployment = make_deployment(desired=3) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=30) + long_ago = datetime.now(UTC) - timedelta(seconds=60) + routes = _blue_routes(3) + _green_routes( + 3, status=RouteStatus.HEALTHY, status_updated_at=long_ago + ) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(_promote_ids(result)) == 3 + assert len(_scale_in_ids(result)) == 3 + + def test_auto_promote_delay_no_status_updated_at_waits(self) -> None: + """auto_promote=True, delay>0 + status_updated_at=None → PROGRESSING (wait).""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=30) + routes = _blue_routes(2) + _green_routes( + 2, status=RouteStatus.HEALTHY, status_updated_at=None + ) + + result = blue_green_evaluate(deployment, 
routes, spec) + + assert not result.completed + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert len(_promote_ids(result)) == 0 + + def test_auto_promote_delay_uses_latest_timestamp(self) -> None: + """With mixed timestamps, delay check uses the latest one.""" + deployment = make_deployment(desired=2) + spec = BlueGreenSpec(auto_promote=True, promote_delay_seconds=30) + # One route healthy long ago, another route healthy recently + old_healthy = make_route( + revision_id=NEW_REV, + status=RouteStatus.HEALTHY, + traffic_status=RouteTrafficStatus.INACTIVE, + traffic_ratio=0.0, + status_updated_at=datetime.now(UTC) - timedelta(seconds=120), + ) + recent_healthy = make_route( + revision_id=NEW_REV, + status=RouteStatus.HEALTHY, + traffic_status=RouteTrafficStatus.INACTIVE, + traffic_ratio=0.0, + status_updated_at=datetime.now(UTC) - timedelta(seconds=5), + ) + routes = _blue_routes(2) + [old_healthy, recent_healthy] + + result = blue_green_evaluate(deployment, routes, spec) + + # Latest is 5 seconds ago, delay is 30 seconds → not elapsed yet + assert not result.completed + assert result.sub_step == DeploymentSubStep.PROGRESSING + def test_default_spec_auto_promote_false(self) -> None: """Default BlueGreenSpec has auto_promote=False.""" deployment = make_deployment(desired=2) From 03bac413eae5558093c2d2a95c8dedbe042a383d Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 10:05:41 +0000 Subject: [PATCH 7/8] misc: Revert wrong change --- changes/9567.feature.md | 1 - .../deployment/strategy/rolling_update.py | 146 +-- .../strategy/test_rolling_update.py | 1128 ----------------- 3 files changed, 4 insertions(+), 1271 deletions(-) delete mode 100644 changes/9567.feature.md delete mode 100644 tests/unit/manager/sokovan/deployment/strategy/test_rolling_update.py diff --git a/changes/9567.feature.md b/changes/9567.feature.md deleted file mode 100644 index 1065196f9bd..00000000000 --- a/changes/9567.feature.md +++ /dev/null @@ -1 +0,0 @@ -Implement 
Rolling Update deployment strategy diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py b/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py index d64ea24e980..fbcb764355c 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py @@ -1,4 +1,4 @@ -"""Rolling update FSM evaluation for a single deployment cycle (BEP-1049). +"""Rolling update strategy evaluation for a single deployment cycle (BEP-1049). Classifies routes by revision (old/new) and status, then decides the next sub-step and route mutations based on ``max_surge`` / ``max_unavailable``. @@ -6,24 +6,15 @@ from __future__ import annotations -import logging from collections.abc import Sequence -from ai.backend.logging import BraceStyleAdapter from ai.backend.manager.data.deployment.types import ( DeploymentInfo, - DeploymentSubStep, RouteInfo, - RouteStatus, ) from ai.backend.manager.models.deployment_policy import RollingUpdateSpec -from ai.backend.manager.models.routing import RoutingRow -from ai.backend.manager.repositories.base import Creator -from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec -from .types import CycleEvaluationResult, RouteChanges - -log = BraceStyleAdapter(logging.getLogger(__name__)) +from .types import CycleEvaluationResult def rolling_update_evaluate( @@ -31,134 +22,5 @@ def rolling_update_evaluate( routes: Sequence[RouteInfo], spec: RollingUpdateSpec, ) -> CycleEvaluationResult: - """Evaluate one cycle of rolling update for a single deployment. - - FSM flow: - 1. Classify routes into old / new by revision_id. - 2. If any new route is PROVISIONING → PROVISIONING (wait). - 3. If no old routes remain and new_healthy >= desired → completed. - 4. If all new routes failed → ROLLED_BACK. - 5. Compute allowed surge/unavailable, decide create/terminate → PROGRESSING. 
- """ - deploying_rev = deployment.deploying_revision_id - desired = deployment.replica_spec.target_replica_count - - # ── 1. Classify routes ── - old_active: list[RouteInfo] = [] - new_provisioning: list[RouteInfo] = [] - new_healthy: list[RouteInfo] = [] - new_failed: list[RouteInfo] = [] - - for r in routes: - is_new = r.revision_id == deploying_rev - if not is_new: - if r.status.is_active(): - old_active.append(r) - continue - - if r.status == RouteStatus.PROVISIONING: - new_provisioning.append(r) - elif r.status == RouteStatus.HEALTHY: - new_healthy.append(r) - elif r.status in (RouteStatus.FAILED_TO_START, RouteStatus.TERMINATED): - new_failed.append(r) - elif r.status.is_active(): - new_healthy.append(r) - - total_new_live = len(new_provisioning) + len(new_healthy) - - # ── 2. PROVISIONING: wait for in-flight routes ── - if new_provisioning: - log.debug( - "deployment {}: {} new routes still provisioning", - deployment.id, - len(new_provisioning), - ) - return CycleEvaluationResult(sub_step=DeploymentSubStep.PROVISIONING) - - # ── 3. Completed: all old replaced, enough new healthy ── - if not old_active and len(new_healthy) >= desired: - log.info( - "deployment {}: rolling update complete ({} healthy routes)", - deployment.id, - len(new_healthy), - ) - return CycleEvaluationResult( - sub_step=DeploymentSubStep.PROGRESSING, - completed=True, - ) - - # ── 4. Rolled back: every new route failed ── - if total_new_live == 0 and new_failed: - log.warning( - "deployment {}: all {} new routes failed — rolling back", - deployment.id, - len(new_failed), - ) - return CycleEvaluationResult(sub_step=DeploymentSubStep.ROLLED_BACK) - - # ── 5. 
PROGRESSING: compute surge / unavailable budget ── - max_surge = spec.max_surge - max_unavailable = spec.max_unavailable - - # Total pods allowed at peak = desired + max_surge - max_total = desired + max_surge - current_total = len(old_active) + total_new_live - - # Minimum available pods = desired - max_unavailable - min_available = max(0, desired - max_unavailable) - - route_changes = RouteChanges() - - # Decide how many new routes to create - can_create = max_total - current_total - still_needed = desired - total_new_live - to_create = max(0, min(can_create, still_needed)) - - if to_create > 0: - route_changes.scale_out_specs = _build_route_creators(deployment, to_create) - - # Decide how many old routes to terminate - available_count = len(new_healthy) + len(old_active) - can_terminate = available_count - min_available - to_terminate = max(0, min(can_terminate, len(old_active))) - - if to_terminate > 0: - # Terminate old routes with lowest termination priority first - sorted_old = sorted(old_active, key=lambda r: r.status.termination_priority()) - for r in sorted_old[:to_terminate]: - route_changes.scale_in_route_ids.append(r.route_id) - - log.debug( - "deployment {}: PROGRESSING create={}, terminate={}, " - "old_active={}, new_healthy={}, new_prov={}", - deployment.id, - to_create, - to_terminate, - len(old_active), - len(new_healthy), - len(new_provisioning), - ) - - return CycleEvaluationResult( - sub_step=DeploymentSubStep.PROGRESSING, - route_changes=route_changes, - ) - - -def _build_route_creators( - deployment: DeploymentInfo, - count: int, -) -> list[Creator[RoutingRow]]: - """Build route creator specs for new revision routes.""" - creators: list[Creator[RoutingRow]] = [] - for _ in range(count): - spec = RouteCreatorSpec( - endpoint_id=deployment.id, - session_owner_id=deployment.metadata.session_owner, - domain=deployment.metadata.domain, - project_id=deployment.metadata.project, - revision_id=deployment.deploying_revision_id, - ) - 
creators.append(Creator(spec=spec)) - return creators + """Evaluate one cycle of rolling update for a single deployment.""" + raise NotImplementedError("Rolling update strategy is not yet implemented") diff --git a/tests/unit/manager/sokovan/deployment/strategy/test_rolling_update.py b/tests/unit/manager/sokovan/deployment/strategy/test_rolling_update.py deleted file mode 100644 index ce285060a09..00000000000 --- a/tests/unit/manager/sokovan/deployment/strategy/test_rolling_update.py +++ /dev/null @@ -1,1128 +0,0 @@ -"""Comprehensive tests for the rolling update FSM evaluation (BEP-1049). - -Tests cover: -- Various max_surge / max_unavailable combinations -- Single and multi-replica scenarios -- FSM state transitions: PROVISIONING, PROGRESSING, ROLLED_BACK, completed -- Edge cases: no routes, all failed, mixed statuses -- Termination priority ordering -""" - -from __future__ import annotations - -from datetime import UTC, datetime -from uuid import UUID, uuid4 - -from ai.backend.common.data.endpoint.types import EndpointLifecycle -from ai.backend.common.types import SessionId -from ai.backend.manager.data.deployment.types import ( - DeploymentInfo, - DeploymentMetadata, - DeploymentNetworkSpec, - DeploymentState, - DeploymentSubStep, - ReplicaSpec, - RouteInfo, - RouteStatus, - RouteTrafficStatus, -) -from ai.backend.manager.models.deployment_policy import RollingUpdateSpec -from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec -from ai.backend.manager.sokovan.deployment.strategy.rolling_update import ( - rolling_update_evaluate, -) -from ai.backend.manager.sokovan.deployment.strategy.types import CycleEvaluationResult - -ENDPOINT_ID = UUID("aaaaaaaa-0000-0000-0000-aaaaaaaaaaaa") -OLD_REV = UUID("11111111-1111-1111-1111-111111111111") -NEW_REV = UUID("22222222-2222-2222-2222-222222222222") -PROJECT_ID = UUID("cccccccc-cccc-cccc-cccc-cccccccccccc") -USER_ID = UUID("dddddddd-dddd-dddd-dddd-dddddddddddd") - - -def make_deployment( - *, - 
desired: int = 1, - deploying_revision_id: UUID = NEW_REV, - current_revision_id: UUID = OLD_REV, - endpoint_id: UUID = ENDPOINT_ID, -) -> DeploymentInfo: - return DeploymentInfo( - id=endpoint_id, - metadata=DeploymentMetadata( - name="test-deploy", - domain="default", - project=PROJECT_ID, - resource_group="default", - created_user=USER_ID, - session_owner=USER_ID, - created_at=datetime.now(UTC), - revision_history_limit=5, - ), - state=DeploymentState( - lifecycle=EndpointLifecycle.DEPLOYING, - retry_count=0, - ), - replica_spec=ReplicaSpec( - replica_count=desired, - ), - network=DeploymentNetworkSpec(open_to_public=False), - model_revisions=[], - current_revision_id=current_revision_id, - deploying_revision_id=deploying_revision_id, - ) - - -def make_route( - *, - revision_id: UUID, - status: RouteStatus = RouteStatus.HEALTHY, - endpoint_id: UUID = ENDPOINT_ID, - route_id: UUID | None = None, -) -> RouteInfo: - return RouteInfo( - route_id=route_id or uuid4(), - endpoint_id=endpoint_id, - session_id=SessionId(uuid4()), - status=status, - traffic_ratio=1.0 if status.is_active() else 0.0, - created_at=datetime.now(UTC), - revision_id=revision_id, - traffic_status=RouteTrafficStatus.ACTIVE - if status.is_active() - else RouteTrafficStatus.INACTIVE, - ) - - -# --------------------------------------------------------------------------- -# Helper -# --------------------------------------------------------------------------- - - -def _count_scale_out(result: CycleEvaluationResult) -> int: - return len(result.route_changes.scale_out_specs) - - -def _scale_in_ids(result: CycleEvaluationResult) -> list[UUID]: - return result.route_changes.scale_in_route_ids - - -# =========================================================================== -# 1. 
Basic FSM states -# =========================================================================== - - -class TestBasicFSMStates: - """Test fundamental FSM transitions.""" - - def test_no_routes_initial_cycle_creates_new(self) -> None: - """First cycle with 0 routes → PROGRESSING, creates desired count.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - - result = rolling_update_evaluate(deployment, [], spec) - - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert not result.completed - assert _count_scale_out(result) == 1 - assert len(_scale_in_ids(result)) == 0 - - def test_new_provisioning_waits(self) -> None: - """New routes in PROVISIONING → wait (PROVISIONING sub-step).""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.sub_step == DeploymentSubStep.PROVISIONING - assert not result.completed - assert _count_scale_out(result) == 0 - assert len(_scale_in_ids(result)) == 0 - - def test_completed_when_all_new_healthy_and_no_old(self) -> None: - """All old gone + new_healthy >= desired → completed.""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.completed - assert result.sub_step == DeploymentSubStep.PROGRESSING - - def test_rollback_when_all_new_failed(self) -> None: - """All new routes failed → ROLLED_BACK.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - 
make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.FAILED_TO_START), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.sub_step == DeploymentSubStep.ROLLED_BACK - assert not result.completed - - def test_rollback_with_terminated_new_routes(self) -> None: - """New routes in TERMINATED also count as failed.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.TERMINATED), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.sub_step == DeploymentSubStep.ROLLED_BACK - - -# =========================================================================== -# 2. max_surge variations -# =========================================================================== - - -class TestMaxSurge: - """Test max_surge parameter controls.""" - - def test_surge_1_desired_1_creates_1(self) -> None: - """surge=1, desired=1: 1 old → create 1 new (total=2 allowed).""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert _count_scale_out(result) == 1 - - def test_surge_2_desired_3_creates_2(self) -> None: - """surge=2, desired=3: 3 old → max_total=5, can create 2.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=2, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert 
_count_scale_out(result) == 2 - - def test_surge_0_desired_3_no_create_without_unavailable(self) -> None: - """surge=0, unavailable=0: cannot create new (no budget).""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=0, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # max_total = 3+0 = 3, current_total = 3, can_create = 0 - assert _count_scale_out(result) == 0 - # min_available = 3-0 = 3, available=3, can_terminate = 0 - assert len(_scale_in_ids(result)) == 0 - - def test_surge_3_desired_2_caps_at_desired(self) -> None: - """surge=3, desired=2: creates at most desired - already_new.""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=3, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # max_total = 5, current_total = 2, can_create = 3 - # still_needed = 2 - 0 = 2 → min(3,2) = 2 - assert _count_scale_out(result) == 2 - - def test_surge_already_at_max_no_create(self) -> None: - """Already at max_total → no new creates.""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # max_total = 3, current = 3 → can_create = 0 - assert _count_scale_out(result) == 0 - - -# =========================================================================== -# 3. 
max_unavailable variations -# =========================================================================== - - -class TestMaxUnavailable: - """Test max_unavailable parameter controls.""" - - def test_unavailable_0_no_terminate_until_new_healthy(self) -> None: - """unavailable=0: only terminate when new routes are healthy.""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # min_available = 2-0 = 2, available = 0(new_healthy) + 2(old) = 2 - # can_terminate = 2 - 2 = 0 - assert len(_scale_in_ids(result)) == 0 - - def test_unavailable_1_terminates_1_old(self) -> None: - """unavailable=1: can terminate 1 old even without new ready.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=1) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # min_available = 3-1 = 2, available = 0+3 = 3, can_terminate = 1 - assert len(_scale_in_ids(result)) == 1 - - def test_unavailable_2_terminates_2_old(self) -> None: - """unavailable=2: can terminate up to 2 old routes.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=0, max_unavailable=2) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # min_available = 3-2 = 1, available = 0+3 = 3, can_terminate = 2 - assert len(_scale_in_ids(result)) == 2 - # max_total = 3+0 
= 3, current = 3, can_create = 0 - # But still_needed = 3 → min(0, 3) = 0 - assert _count_scale_out(result) == 0 - - def test_unavailable_with_new_healthy_allows_more_termination(self) -> None: - """With new healthy routes, more old can be terminated.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # min_available = 3, available = 1(new_healthy)+3(old) = 4 - # can_terminate = 4 - 3 = 1 - assert len(_scale_in_ids(result)) == 1 - - def test_unavailable_exceeds_desired_floors_to_zero(self) -> None: - """unavailable > desired → min_available floors to 0.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=0, max_unavailable=5) - routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)] - - result = rolling_update_evaluate(deployment, routes, spec) - - # min_available = max(0, 1-5) = 0, available = 0+1 = 1 - # can_terminate = 1 - 0 = 1 - assert len(_scale_in_ids(result)) == 1 - - -# =========================================================================== -# 4. 
Combined surge + unavailable -# =========================================================================== - - -class TestCombinedSurgeAndUnavailable: - """Test combinations of max_surge and max_unavailable.""" - - def test_surge_1_unavailable_1_desired_3(self) -> None: - """surge=1, unavailable=1, desired=3 with 3 old routes.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=1) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # max_total = 4, current = 3, can_create = 1, still_needed = 3 → create 1 - assert _count_scale_out(result) == 1 - # min_available = 2, available = 0+3 = 3, can_terminate = 1 - assert len(_scale_in_ids(result)) == 1 - - def test_surge_2_unavailable_1_desired_4(self) -> None: - """surge=2, unavailable=1, desired=4 with 4 old routes.""" - deployment = make_deployment(desired=4) - spec = RollingUpdateSpec(max_surge=2, max_unavailable=1) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # max_total = 6, current = 4, can_create = 2, still_needed = 4 → 2 - assert _count_scale_out(result) == 2 - # min_available = 3, available = 0+4 = 4, can_terminate = 1 - assert len(_scale_in_ids(result)) == 1 - - def test_aggressive_strategy_surge_3_unavail_2_desired_3(self) -> None: - """Aggressive: surge=3, unavailable=2, desired=3 with 3 old.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=3, max_unavailable=2) - routes = [ - make_route(revision_id=OLD_REV, 
status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # max_total = 6, current = 3, can_create = 3, still_needed = 3 → 3 - assert _count_scale_out(result) == 3 - # min_available = 1, available = 0+3 = 3, can_terminate = 2 - assert len(_scale_in_ids(result)) == 2 - - -# =========================================================================== -# 5. Multi-cycle progression -# =========================================================================== - - -class TestMultiCycleProgression: - """Simulate multiple evaluation cycles.""" - - def test_cycle_2_after_new_routes_become_healthy(self) -> None: - """After new routes become healthy, old ones can be terminated.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # max_total = 4, current = 3, can_create = 1, still_needed = 2 → 1 - assert _count_scale_out(result) == 1 - # min_available = 3, available = 1+2 = 3, can_terminate = 0 - # Wait, that's wrong: available = 1(new_healthy) + 2(old) = 3 - # can_terminate = 3 - 3 = 0 - assert len(_scale_in_ids(result)) == 0 - - def test_cycle_3_with_2_new_healthy(self) -> None: - """2 new healthy, 2 old: can terminate 1 old and create 1 new.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, 
status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # max_total = 4, current = 4, can_create = 0 - assert _count_scale_out(result) == 0 - # min_available = 3, available = 2+2 = 4, can_terminate = 1 - assert len(_scale_in_ids(result)) == 1 - - def test_final_cycle_completes(self) -> None: - """3 new healthy, 0 old → completed.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.completed - - def test_not_completed_when_old_still_exists(self) -> None: - """Even with enough new, old still exists → not completed.""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert not result.completed - # Should terminate the old route - assert len(_scale_in_ids(result)) == 1 - - -# =========================================================================== -# 6. 
Mixed route statuses -# =========================================================================== - - -class TestMixedRouteStatuses: - """Test with routes in various statuses.""" - - def test_degraded_new_counts_as_healthy(self) -> None: - """DEGRADED new routes count as active (is_active=True).""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=NEW_REV, status=RouteStatus.DEGRADED), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.completed - - def test_unhealthy_new_counts_as_healthy(self) -> None: - """UNHEALTHY new routes count as active.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=NEW_REV, status=RouteStatus.UNHEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.completed - - def test_old_terminating_not_counted_as_active(self) -> None: - """Old routes in TERMINATING are not counted as old_active.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.TERMINATING), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # old_active = 0 (terminating doesn't count), new_healthy = 1 >= desired - assert result.completed - - def test_old_terminated_not_counted(self) -> None: - """Old routes in TERMINATED are not counted as old_active.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.TERMINATED), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.completed - - def 
test_mixed_old_statuses_counts_only_active(self) -> None: - """Only active old routes are counted in old_active.""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.TERMINATING), - make_route(revision_id=OLD_REV, status=RouteStatus.TERMINATED), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # old_active = 1 (only HEALTHY), total_new_live = 0 - # max_total = 3, current = 1, can_create = 2, still_needed = 2 → 2 - assert _count_scale_out(result) == 2 - - def test_mix_of_failed_and_healthy_new_not_rollback(self) -> None: - """Some new failed, some new healthy → no rollback (live routes exist).""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=2, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.FAILED_TO_START), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # total_new_live = 1 (healthy) > 0, so NOT rolled back - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert not result.completed - - -# =========================================================================== -# 7. 
Termination priority ordering -# =========================================================================== - - -class TestTerminationPriority: - """Test that old routes are terminated in priority order.""" - - def test_unhealthy_terminated_before_healthy(self) -> None: - """UNHEALTHY old routes should be terminated before HEALTHY ones.""" - unhealthy_id = UUID("00000000-0000-0000-0000-000000000001") - healthy_id = UUID("00000000-0000-0000-0000-000000000002") - - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=0, max_unavailable=1) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY, route_id=healthy_id), - make_route(revision_id=OLD_REV, status=RouteStatus.UNHEALTHY, route_id=unhealthy_id), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert len(_scale_in_ids(result)) == 1 - assert _scale_in_ids(result)[0] == unhealthy_id - - def test_degraded_before_healthy(self) -> None: - """DEGRADED old routes terminated before HEALTHY ones.""" - degraded_id = UUID("00000000-0000-0000-0000-000000000001") - healthy_id = UUID("00000000-0000-0000-0000-000000000002") - - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=0, max_unavailable=1) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY, route_id=healthy_id), - make_route(revision_id=OLD_REV, status=RouteStatus.DEGRADED, route_id=degraded_id), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert len(_scale_in_ids(result)) == 1 - assert _scale_in_ids(result)[0] == degraded_id - - def test_priority_order_unhealthy_degraded_provisioning_healthy(self) -> None: - """Full priority order: unhealthy < degraded < provisioning < healthy.""" - unhealthy_id = UUID("00000000-0000-0000-0000-000000000001") - degraded_id = UUID("00000000-0000-0000-0000-000000000002") - provisioning_id = UUID("00000000-0000-0000-0000-000000000003") - healthy_id = UUID("00000000-0000-0000-0000-000000000004") - 
- deployment = make_deployment(desired=4) - spec = RollingUpdateSpec(max_surge=0, max_unavailable=3) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY, route_id=healthy_id), - make_route( - revision_id=OLD_REV, status=RouteStatus.PROVISIONING, route_id=provisioning_id - ), - make_route(revision_id=OLD_REV, status=RouteStatus.DEGRADED, route_id=degraded_id), - make_route(revision_id=OLD_REV, status=RouteStatus.UNHEALTHY, route_id=unhealthy_id), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - terminated = _scale_in_ids(result) - assert len(terminated) == 3 - assert terminated[0] == unhealthy_id - assert terminated[1] == degraded_id - assert terminated[2] == provisioning_id - - -# =========================================================================== -# 8. Edge cases -# =========================================================================== - - -class TestEdgeCases: - """Edge cases and boundary conditions.""" - - def test_desired_0_no_routes_completed(self) -> None: - """desired=0, no routes → completed (vacuously true).""" - deployment = make_deployment(desired=0) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - - result = rolling_update_evaluate(deployment, [], spec) - - assert result.completed - - def test_more_new_healthy_than_desired_still_completes(self) -> None: - """new_healthy > desired and no old → completed.""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=2, max_unavailable=0) - routes = [ - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.completed - - def test_no_routes_no_failed_creates_new(self) -> None: - """Empty routes list → PROGRESSING with scale out.""" - deployment = make_deployment(desired=3) - spec = 
RollingUpdateSpec(max_surge=2, max_unavailable=1) - - result = rolling_update_evaluate(deployment, [], spec) - - assert result.sub_step == DeploymentSubStep.PROGRESSING - # max_total = 5, current = 0, can_create = 5, still_needed = 3 → 3 - assert _count_scale_out(result) == 3 - - def test_only_failed_new_no_old_rolls_back(self) -> None: - """Only failed new routes, no old → ROLLED_BACK.""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=NEW_REV, status=RouteStatus.FAILED_TO_START), - make_route(revision_id=NEW_REV, status=RouteStatus.FAILED_TO_START), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.sub_step == DeploymentSubStep.ROLLED_BACK - - def test_all_old_inactive_no_new_creates_desired(self) -> None: - """All old routes are inactive (terminated), no new → create desired.""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.TERMINATED), - make_route(revision_id=OLD_REV, status=RouteStatus.TERMINATED), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # old_active = 0, no new → max_total = 3, current = 0, can_create = 3 - # still_needed = 2, min(3, 2) = 2 - assert _count_scale_out(result) == 2 - - def test_large_desired_surge_1_unavailable_0_creates_exactly_1(self) -> None: - """Large desired with conservative settings creates exactly 1.""" - deployment = make_deployment(desired=10) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(10)] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert _count_scale_out(result) == 1 - assert len(_scale_in_ids(result)) == 0 - - def test_deploying_rev_none_all_routes_classified_as_old(self) -> None: - """If deploying_revision_id is None, all routes are 
old (r.revision_id != None).""" - deployment = make_deployment(desired=1, deploying_revision_id=None) # type: ignore[arg-type] - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)] - - result = rolling_update_evaluate(deployment, routes, spec) - - # All classified as old, no new → PROGRESSING with create - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert _count_scale_out(result) == 1 - - def test_route_without_revision_classified_as_old(self) -> None: - """Routes with revision_id=None are classified as old.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [make_route(revision_id=None, status=RouteStatus.HEALTHY)] # type: ignore[arg-type] - - result = rolling_update_evaluate(deployment, routes, spec) - - # revision_id=None != NEW_REV, so classified as old - assert _count_scale_out(result) == 1 - - def test_provisioning_prioritized_over_completion_check(self) -> None: - """PROVISIONING check comes before completion check.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # Even though new_healthy >= desired, PROVISIONING takes precedence - assert result.sub_step == DeploymentSubStep.PROVISIONING - assert not result.completed - - -# =========================================================================== -# 9. 
Route creator specs validation -# =========================================================================== - - -class TestRouteCreatorSpecs: - """Validate that route creator specs have correct fields.""" - - def test_creator_specs_use_deploying_revision(self) -> None: - """Created routes should use the deploying revision.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - - result = rolling_update_evaluate(deployment, [], spec) - - assert _count_scale_out(result) == 1 - creator_spec = result.route_changes.scale_out_specs[0].spec - assert isinstance(creator_spec, RouteCreatorSpec) - assert creator_spec.revision_id == NEW_REV - assert creator_spec.endpoint_id == ENDPOINT_ID - assert creator_spec.session_owner_id == USER_ID - assert creator_spec.domain == "default" - assert creator_spec.project_id == PROJECT_ID - - def test_multiple_creators_all_correct(self) -> None: - """Multiple creators all have correct metadata.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=3, max_unavailable=0) - - result = rolling_update_evaluate(deployment, [], spec) - - assert _count_scale_out(result) == 3 - for creator in result.route_changes.scale_out_specs: - creator_spec = creator.spec - assert isinstance(creator_spec, RouteCreatorSpec) - assert creator_spec.revision_id == NEW_REV - assert creator_spec.endpoint_id == ENDPOINT_ID - - -# =========================================================================== -# 10. 
Realistic multi-step scenario (desired=5) -# =========================================================================== - - -class TestRealisticScenario: - """Simulate a realistic rolling update with desired=5, surge=2, unavail=1.""" - - def test_step_by_step_rolling_update(self) -> None: - """Full simulation of a rolling update across multiple cycles.""" - deployment = make_deployment(desired=5) - spec = RollingUpdateSpec(max_surge=2, max_unavailable=1) - - # Cycle 1: 5 old, 0 new - old_routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY) for _ in range(5)] - r1 = rolling_update_evaluate(deployment, old_routes, spec) - - # max_total = 7, current = 5, can_create = 2, still_needed = 5 → 2 - assert _count_scale_out(r1) == 2 - # min_available = 4, available = 0+5 = 5, can_terminate = 1 - assert len(_scale_in_ids(r1)) == 1 - - # Cycle 2: 4 old, 2 new healthy - routes_c2 = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - r2 = rolling_update_evaluate(deployment, routes_c2, spec) - - # max_total = 7, current = 6, can_create = 1, still_needed = 3 → 1 - assert _count_scale_out(r2) == 1 - # min_available = 4, available = 2+4 = 6, can_terminate = 2 - assert len(_scale_in_ids(r2)) == 2 - - # Cycle 3: 2 old, 3 new healthy - routes_c3 = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - r3 = rolling_update_evaluate(deployment, routes_c3, spec) - - # max_total = 7, 
current = 5, can_create = 2, still_needed = 2 → 2 - assert _count_scale_out(r3) == 2 - # min_available = 4, available = 3+2 = 5, can_terminate = 1 - assert len(_scale_in_ids(r3)) == 1 - - # Cycle 4: 1 old, 5 new healthy - routes_c4 = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - r4 = rolling_update_evaluate(deployment, routes_c4, spec) - - # can_create = 0 (still_needed = 0), can_terminate = 1 - assert _count_scale_out(r4) == 0 - assert len(_scale_in_ids(r4)) == 1 - assert not r4.completed - - # Cycle 5: 0 old, 5 new healthy → completed - routes_c5 = [ - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - r5 = rolling_update_evaluate(deployment, routes_c5, spec) - - assert r5.completed - - -# =========================================================================== -# 11. Deadlock and stall detection -# =========================================================================== - - -class TestDeadlockAndStall: - """Test scenarios where the FSM could potentially stall.""" - - def test_surge_0_unavailable_0_deadlock(self) -> None: - """Both surge=0 and unavailable=0 → no progress possible (deadlock). - - This is a configuration error: at least one must be > 0 for progress. - The FSM correctly returns PROGRESSING with no changes. 
- """ - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=0, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert _count_scale_out(result) == 0 - assert len(_scale_in_ids(result)) == 0 - # This is a known deadlock — no progress is possible. - - def test_surge_0_unavailable_1_terminates_first_then_creates(self) -> None: - """surge=0, unavailable=1 → terminate 1, then next cycle creates 1. - - This pattern kills old routes before creating new ones (downtime-tolerant). - """ - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=0, max_unavailable=1) - - # Cycle 1: 3 old → terminate 1, create 0 - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - r1 = rolling_update_evaluate(deployment, routes, spec) - assert _count_scale_out(r1) == 0 - assert len(_scale_in_ids(r1)) == 1 - - # Cycle 2: 2 old → now we can create 1 - routes_c2 = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - r2 = rolling_update_evaluate(deployment, routes_c2, spec) - # max_total = 3, current = 2, can_create = 1, still_needed = 3 → 1 - assert _count_scale_out(r2) == 1 - # min_available = 2, available = 0+2 = 2, can_terminate = 0 - assert len(_scale_in_ids(r2)) == 0 - - def test_partial_new_failure_continues_progress(self) -> None: - """Some new routes fail while others succeed → continue, no rollback.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=2, max_unavailable=0) - routes = [ - 
make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.FAILED_TO_START), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # total_new_live = 1 > 0, so NOT rolled back - assert result.sub_step == DeploymentSubStep.PROGRESSING - # still_needed = 3-1 = 2, max_total=5, current=4 → can_create = 1 - assert _count_scale_out(result) == 1 - - def test_new_routes_exceed_desired_no_extra_create(self) -> None: - """More new_live than desired → no extra creation (still_needed < 0).""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=2, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # still_needed = 2-3 = -1 → to_create = max(0, ...) = 0 - assert _count_scale_out(result) == 0 - # min_available = 2, available = 3+1 = 4, can_terminate = 2 → min(2, 1) = 1 - assert len(_scale_in_ids(result)) == 1 - - -# =========================================================================== -# 12. 
desired_replica_count vs replica_count -# =========================================================================== - - -class TestDesiredReplicaCount: - """Test that the correct desired count is used.""" - - def test_desired_replica_count_overrides_replica_count(self) -> None: - """When desired_replica_count is set, it takes precedence.""" - deployment = make_deployment(desired=3) - # Override desired_replica_count - deployment.replica_spec = ReplicaSpec( - replica_count=1, - desired_replica_count=3, - ) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY)] - - result = rolling_update_evaluate(deployment, routes, spec) - - # desired is 3 (from desired_replica_count), not 1 - # max_total = 4, current = 1, can_create = 3, still_needed = 3 → 3 - assert _count_scale_out(result) == 3 - - def test_replica_count_used_when_no_desired(self) -> None: - """When desired_replica_count is None, uses replica_count.""" - deployment = make_deployment(desired=2) - deployment.replica_spec = ReplicaSpec( - replica_count=2, - desired_replica_count=None, - ) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.completed - - -# =========================================================================== -# 13. 
Scale-down during rolling update -# =========================================================================== - - -class TestScaleDownDuringRollingUpdate: - """Test behavior when desired is reduced during rolling update.""" - - def test_desired_reduced_terminates_excess_old(self) -> None: - """If desired is lowered, more old can be terminated.""" - deployment = make_deployment(desired=1) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # max_total = 2, current = 3 → can_create = max(0, -1) = 0 - assert _count_scale_out(result) == 0 - # Wait: still_needed = 1 - 0 = 1, but can_create is capped by max_total - # max_total = 2, current = 3 → can_create = -1 → to_create = max(0, min(-1, 1)) = 0 - # min_available = 1, available = 0+3 = 3, can_terminate = 2 - assert len(_scale_in_ids(result)) == 2 - - def test_desired_increased_creates_more(self) -> None: - """If desired is raised, more new routes are created.""" - deployment = make_deployment(desired=5) - spec = RollingUpdateSpec(max_surge=2, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # max_total = 7, current = 2, can_create = 5, still_needed = 5 → 5 - assert _count_scale_out(result) == 5 - - -# =========================================================================== -# 14. 
Concurrent provisioning and termination -# =========================================================================== - - -class TestConcurrentOperations: - """Test that provisioning blocks further changes correctly.""" - - def test_provisioning_blocks_all_further_actions(self) -> None: - """Any new route in PROVISIONING → wait, even if old can be terminated.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=2, max_unavailable=1) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - # PROVISIONING takes priority over all other decisions - assert result.sub_step == DeploymentSubStep.PROVISIONING - assert _count_scale_out(result) == 0 - assert len(_scale_in_ids(result)) == 0 - - def test_multiple_provisioning_routes_still_waits(self) -> None: - """Multiple PROVISIONING routes → still PROVISIONING.""" - deployment = make_deployment(desired=3) - spec = RollingUpdateSpec(max_surge=3, max_unavailable=3) - routes = [ - make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), - make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), - make_route(revision_id=NEW_REV, status=RouteStatus.PROVISIONING), - ] - - result = rolling_update_evaluate(deployment, routes, spec) - - assert result.sub_step == DeploymentSubStep.PROVISIONING - - def test_old_provisioning_counted_as_active(self) -> None: - """Old routes in PROVISIONING are counted as old_active.""" - deployment = make_deployment(desired=2) - spec = RollingUpdateSpec(max_surge=1, max_unavailable=0) - routes = [ - make_route(revision_id=OLD_REV, status=RouteStatus.PROVISIONING), - make_route(revision_id=OLD_REV, status=RouteStatus.HEALTHY), - ] - - result = rolling_update_evaluate(deployment, 
routes, spec) - - # old_active = 2 (both PROVISIONING and HEALTHY are active) - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert not result.completed From af7f67b473d2eac55a91a4c53f40b8fa82e20e8b Mon Sep 17 00:00:00 2001 From: jopemachine Date: Wed, 4 Mar 2026 03:57:01 +0000 Subject: [PATCH 8/8] wip --- .../manager/sokovan/deployment/strategy/blue_green.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py index c690ce386bd..a94221c8729 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py @@ -80,7 +80,7 @@ def blue_green_evaluate( desired, ) route_changes = RouteChanges( - scale_out_specs=_build_route_creators(deployment, desired), + rollout_specs=_build_route_creators(deployment, desired), ) return CycleEvaluationResult( sub_step=DeploymentSubStep.PROVISIONING, @@ -104,7 +104,7 @@ def blue_green_evaluate( len(green_failed), ) route_changes = RouteChanges( - scale_in_route_ids=[r.route_id for r in green_failed], + drain_route_ids=[r.route_id for r in green_failed], ) return CycleEvaluationResult( sub_step=DeploymentSubStep.ROLLED_BACK, @@ -159,7 +159,7 @@ def blue_green_evaluate( ) route_changes = RouteChanges( promote_route_ids=[r.route_id for r in green_healthy], - scale_in_route_ids=[r.route_id for r in blue_active], + drain_route_ids=[r.route_id for r in blue_active], ) return CycleEvaluationResult( sub_step=DeploymentSubStep.PROGRESSING,