From 8a99f4743eb1be36a658e64cd20f290e28d3c218 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Sun, 1 Mar 2026 23:54:26 +0000 Subject: [PATCH 01/23] feat: Implement Rolling Update deployment strategy --- .../backend/manager/data/deployment/types.py | 14 ++ src/ai/backend/manager/defs.py | 1 + src/ai/backend/manager/models/endpoint/row.py | 2 + .../deployment/db_source/db_source.py | 68 +++++++ .../repositories/deployment/repository.py | 24 +++ .../manager/sokovan/deployment/coordinator.py | 184 ++++++++++++++++- .../sokovan/deployment/handlers/__init__.py | 10 + .../sokovan/deployment/handlers/deploying.py | 189 ++++++++++++++++++ .../sokovan/deployment/strategy/__init__.py | 1 + .../sokovan/deployment/strategy/evaluator.py | 150 ++++++++++++++ .../deployment/strategy/rolling_update.py | 164 +++++++++++++++ .../sokovan/deployment/strategy/types.py | 48 +++++ .../manager/sokovan/deployment/types.py | 2 + 13 files changed, 852 insertions(+), 5 deletions(-) create mode 100644 src/ai/backend/manager/sokovan/deployment/handlers/deploying.py create mode 100644 src/ai/backend/manager/sokovan/deployment/strategy/__init__.py create mode 100644 src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py create mode 100644 src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py create mode 100644 src/ai/backend/manager/sokovan/deployment/strategy/types.py diff --git a/src/ai/backend/manager/data/deployment/types.py b/src/ai/backend/manager/data/deployment/types.py index 2b2d00caaf7..ccee6a29ed1 100644 --- a/src/ai/backend/manager/data/deployment/types.py +++ b/src/ai/backend/manager/data/deployment/types.py @@ -157,6 +157,19 @@ class DeploymentSubStatus(enum.StrEnum): """ +class DeploymentSubStep(DeploymentSubStatus): + """Sub-steps for the DEPLOYING lifecycle phase. + + - PROVISIONING: New revision routes are being provisioned; waiting for readiness. + - PROGRESSING: Actively replacing old routes with new routes. 
+ - ROLLED_BACK: All new routes failed; deployment rolled back to previous revision. + """ + + PROVISIONING = "provisioning" + PROGRESSING = "progressing" + ROLLED_BACK = "rolled_back" + + @dataclass(frozen=True) class DeploymentLifecycleStatus: """Target lifecycle state for a deployment status transition. @@ -353,6 +366,7 @@ class DeploymentInfo: network: DeploymentNetworkSpec model_revisions: list[ModelRevisionSpec] current_revision_id: UUID | None = None + deploying_revision_id: UUID | None = None def target_revision(self) -> ModelRevisionSpec | None: if self.model_revisions: diff --git a/src/ai/backend/manager/defs.py b/src/ai/backend/manager/defs.py index c9a1f9b073c..67d0bd066ae 100644 --- a/src/ai/backend/manager/defs.py +++ b/src/ai/backend/manager/defs.py @@ -110,6 +110,7 @@ class LockID(enum.IntEnum): LOCKID_DEPLOYMENT_CHECK_PENDING = 226 # For operations checking PENDING sessions LOCKID_DEPLOYMENT_CHECK_REPLICA = 227 # For operations checking REPLICA sessions LOCKID_DEPLOYMENT_DESTROYING = 228 # For operations destroying deployments + LOCKID_DEPLOYMENT_DEPLOYING = 229 # For operations deploying (rolling update) deployments # Sokovan target status locks (prevent concurrent operations on same status) LOCKID_SOKOVAN_TARGET_PENDING = 230 # For operations targeting PENDING sessions LOCKID_SOKOVAN_TARGET_PREPARING = 231 # For operations targeting PREPARING/PULLING sessions diff --git a/src/ai/backend/manager/models/endpoint/row.py b/src/ai/backend/manager/models/endpoint/row.py index 21c6a36849f..e27daca3095 100644 --- a/src/ai/backend/manager/models/endpoint/row.py +++ b/src/ai/backend/manager/models/endpoint/row.py @@ -837,6 +837,7 @@ def _to_deployment_info_from_revision( ), ], current_revision_id=self.current_revision, + deploying_revision_id=self.deploying_revision, ) def _to_deployment_info_legacy(self) -> DeploymentInfo: @@ -898,6 +899,7 @@ def _to_deployment_info_legacy(self) -> DeploymentInfo: ), ], current_revision_id=self.current_revision, + 
deploying_revision_id=self.deploying_revision, ) diff --git a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py index 7f4b479f82a..ac953e35c20 100644 --- a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py +++ b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py @@ -2248,6 +2248,74 @@ async def delete_deployment_policy( async with self._begin_session_read_committed() as db_sess: return await execute_purger(db_sess, purger) + async def fetch_deployment_policies_by_endpoint_ids( + self, + endpoint_ids: set[uuid.UUID], + ) -> Mapping[uuid.UUID, DeploymentPolicyData]: + """Fetch deployment policies for multiple endpoints in bulk. + + Args: + endpoint_ids: Set of endpoint IDs to fetch policies for. + + Returns: + Mapping of endpoint ID to DeploymentPolicyData. + """ + if not endpoint_ids: + return {} + async with self._db.begin_readonly_session_read_committed() as db_sess: + query = sa.select(DeploymentPolicyRow).where( + DeploymentPolicyRow.endpoint.in_(endpoint_ids) + ) + result = await db_sess.execute(query) + rows = result.scalars().all() + return {row.endpoint: row.to_data() for row in rows} + + async def complete_deployment_revision_swap( + self, + endpoint_ids: set[uuid.UUID], + ) -> None: + """Swap deploying_revision to current_revision for completed deployments. + + Sets current_revision = deploying_revision and deploying_revision = NULL + for the given endpoints. + + Args: + endpoint_ids: Set of endpoint IDs to swap revisions for. 
+ """ + if not endpoint_ids: + return + async with self._begin_session_read_committed() as db_sess: + stmt = ( + sa.update(EndpointRow) + .where(EndpointRow.id.in_(endpoint_ids)) + .values( + current_revision=EndpointRow.deploying_revision, + deploying_revision=None, + ) + ) + await db_sess.execute(stmt) + + async def clear_deploying_revision( + self, + endpoint_ids: set[uuid.UUID], + ) -> None: + """Clear deploying_revision for rolled-back deployments. + + Sets deploying_revision = NULL without modifying current_revision. + + Args: + endpoint_ids: Set of endpoint IDs to clear deploying revision for. + """ + if not endpoint_ids: + return + async with self._begin_session_read_committed() as db_sess: + stmt = ( + sa.update(EndpointRow) + .where(EndpointRow.id.in_(endpoint_ids)) + .values(deploying_revision=None) + ) + await db_sess.execute(stmt) + # ========== Access Token Operations ========== async def create_access_token( diff --git a/src/ai/backend/manager/repositories/deployment/repository.py b/src/ai/backend/manager/repositories/deployment/repository.py index 7849839480a..20a6c9df4c9 100644 --- a/src/ai/backend/manager/repositories/deployment/repository.py +++ b/src/ai/backend/manager/repositories/deployment/repository.py @@ -1216,6 +1216,30 @@ async def delete_deployment_policy( """ return await self._db_source.delete_deployment_policy(purger) + @deployment_repository_resilience.apply() + async def fetch_deployment_policies_by_endpoint_ids( + self, + endpoint_ids: set[uuid.UUID], + ) -> Mapping[uuid.UUID, DeploymentPolicyData]: + """Fetch deployment policies for multiple endpoints in bulk.""" + return await self._db_source.fetch_deployment_policies_by_endpoint_ids(endpoint_ids) + + @deployment_repository_resilience.apply() + async def complete_deployment_revision_swap( + self, + endpoint_ids: set[uuid.UUID], + ) -> None: + """Swap deploying_revision to current_revision for completed deployments.""" + await 
self._db_source.complete_deployment_revision_swap(endpoint_ids) + + @deployment_repository_resilience.apply() + async def clear_deploying_revision( + self, + endpoint_ids: set[uuid.UUID], + ) -> None: + """Clear deploying_revision for rolled-back deployments.""" + await self._db_source.clear_deploying_revision(endpoint_ids) + # =================== # Route operations # =================== diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index 4ca8247d8fe..4e2233c34d1 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -26,8 +26,9 @@ from ai.backend.common.leader.tasks.event_task import EventTaskSpec from ai.backend.logging import BraceStyleAdapter from ai.backend.manager.config.provider import ManagerConfigProvider -from ai.backend.manager.data.deployment.types import DeploymentInfo +from ai.backend.manager.data.deployment.types import DeploymentInfo, DeploymentSubStep from ai.backend.manager.data.session.types import SchedulingResult +from ai.backend.manager.defs import LockID from ai.backend.manager.models.endpoint import EndpointRow from ai.backend.manager.repositories.base.creator import BulkCreator from ai.backend.manager.repositories.base.updater import BatchUpdater @@ -51,13 +52,20 @@ from .handlers import ( CheckPendingDeploymentHandler, CheckReplicaDeploymentHandler, + DeployingProgressingHandler, + DeployingProvisioningHandler, + DeployingRolledBackHandler, DeploymentHandler, DestroyingDeploymentHandler, ReconcileDeploymentHandler, ScalingDeploymentHandler, ) +from .strategy.evaluator import DeploymentStrategyEvaluator from .types import DeploymentExecutionResult, DeploymentLifecycleType +# Handler key: either a simple lifecycle type or a (lifecycle, sub-step) tuple +HandlerKey = DeploymentLifecycleType | tuple[DeploymentLifecycleType, DeploymentSubStep] + log = 
BraceStyleAdapter(logging.getLogger(__name__)) @@ -95,7 +103,8 @@ class DeploymentCoordinator: _valkey_schedule: ValkeyScheduleClient _deployment_controller: DeploymentController _deployment_repository: DeploymentRepository - _deployment_handlers: Mapping[DeploymentLifecycleType, DeploymentHandler] + _deployment_handlers: Mapping[HandlerKey, DeploymentHandler] + _deployment_evaluators: Mapping[DeploymentLifecycleType, DeploymentStrategyEvaluator] _lock_factory: DistributedLockFactory _config_provider: ManagerConfigProvider _event_producer: EventProducer @@ -131,12 +140,17 @@ def __init__( valkey_stat=valkey_stat, ) self._deployment_handlers = self._init_handlers(executor) + self._deployment_evaluators = { + DeploymentLifecycleType.DEPLOYING: DeploymentStrategyEvaluator( + deployment_repo=self._deployment_repository, + ), + } def _init_handlers( self, executor: DeploymentExecutor - ) -> Mapping[DeploymentLifecycleType, DeploymentHandler]: - """Initialize and return the mapping of deployment lifecycle types to their handlers.""" - return { + ) -> Mapping[HandlerKey, DeploymentHandler]: + """Initialize and return the mapping of handler keys to their handlers.""" + handlers: dict[HandlerKey, DeploymentHandler] = { DeploymentLifecycleType.CHECK_PENDING: CheckPendingDeploymentHandler( deployment_executor=executor, deployment_controller=self._deployment_controller, @@ -159,12 +173,37 @@ def _init_handlers( deployment_controller=self._deployment_controller, route_controller=self._route_controller, ), + # DEPLOYING sub-step handlers (keyed by composite key) + (DeploymentLifecycleType.DEPLOYING, DeploymentSubStep.PROVISIONING): ( + DeployingProvisioningHandler( + deployment_controller=self._deployment_controller, + route_controller=self._route_controller, + ) + ), + (DeploymentLifecycleType.DEPLOYING, DeploymentSubStep.PROGRESSING): ( + DeployingProgressingHandler( + deployment_controller=self._deployment_controller, + route_controller=self._route_controller, + ) + ), + 
(DeploymentLifecycleType.DEPLOYING, DeploymentSubStep.ROLLED_BACK): ( + DeployingRolledBackHandler( + deployment_repo=self._deployment_repository, + ) + ), } + return handlers async def process_deployment_lifecycle( self, lifecycle_type: DeploymentLifecycleType, ) -> None: + # Check if this lifecycle type uses an evaluator (e.g. DEPLOYING) + evaluator = self._deployment_evaluators.get(lifecycle_type) + if evaluator is not None: + await self._process_with_evaluator(lifecycle_type, evaluator) + return + handler = self._deployment_handlers.get(lifecycle_type) if not handler: log.warning("No handler for deployment lifecycle type: {}", lifecycle_type.value) @@ -314,6 +353,134 @@ async def _handle_status_transitions( except Exception as e: log.warning("Failed to send lifecycle notification: {}", e) + async def _process_with_evaluator( + self, + lifecycle_type: DeploymentLifecycleType, + evaluator: DeploymentStrategyEvaluator, + ) -> None: + """Process deployments that use a strategy evaluator (e.g. DEPLOYING). + + 1. Acquire distributed lock. + 2. Load DEPLOYING deployments. + 3. Run evaluator (evaluates strategy FSM + applies route mutations). + 4. For each sub-step group, run the corresponding handler. + 5. For completed deployments, swap revisions and transition to READY. 
+ """ + lock_lifetime = self._config_provider.config.manager.session_schedule_lock_lifetime + async with self._lock_factory(LockID.LOCKID_DEPLOYMENT_DEPLOYING, lock_lifetime): + deployments = await self._deployment_repository.get_endpoints_by_statuses([ + EndpointLifecycle.DEPLOYING + ]) + if not deployments: + log.trace("No DEPLOYING deployments to process") + return + log.info("DEPLOYING: processing {} deployments", len(deployments)) + + deployment_ids = [d.id for d in deployments] + with DeploymentRecorderContext.scope( + lifecycle_type.value, entity_ids=deployment_ids + ) as pool: + eval_result = await evaluator.evaluate(deployments) + all_records = pool.build_all_records() + + # Process each sub-step group with its handler + for sub_step, group in eval_result.groups.items(): + handler_key: HandlerKey = (lifecycle_type, sub_step) + handler = self._deployment_handlers.get(handler_key) + if handler is None: + log.warning( + "No handler for sub-step {}/{}", lifecycle_type.value, sub_step.value + ) + continue + + sub_result = await handler.execute(group.deployments) + await self._handle_status_transitions(handler, sub_result, all_records) + + # Post-process outside recorder scope + for sub_step, group in eval_result.groups.items(): + handler_key = (lifecycle_type, sub_step) + handler = self._deployment_handlers.get(handler_key) + if handler is None: + continue + try: + result_for_post = DeploymentExecutionResult(successes=group.deployments) + await handler.post_process(result_for_post) + except Exception as e: + log.error( + "Error during post-processing for sub-step {}: {}", + sub_step.value, + e, + ) + + # Transition completed deployments: swap revision and move to READY + if eval_result.completed: + await self._transition_completed_deployments(lifecycle_type, eval_result.completed) + + async def _transition_completed_deployments( + self, + lifecycle_type: DeploymentLifecycleType, + completed: list[DeploymentInfo], + ) -> None: + """Transition completed DEPLOYING 
deployments to READY. + + 1. Swap deploying_revision → current_revision. + 2. Update lifecycle to READY with history recording. + 3. Send notification events. + """ + endpoint_ids = {deployment.id for deployment in completed} + + # Swap revisions + await self._deployment_repository.complete_deployment_revision_swap(endpoint_ids) + log.info( + "Swapped deploying_revision → current_revision for {} deployments", + len(endpoint_ids), + ) + + # Build lifecycle transition + target_statuses = [EndpointLifecycle.DEPLOYING] + from_status = EndpointLifecycle.DEPLOYING + to_status = EndpointLifecycle.READY + + batch_updater = BatchUpdater( + spec=EndpointLifecycleBatchUpdaterSpec(lifecycle_stage=to_status), + conditions=[ + DeploymentConditions.by_ids(list(endpoint_ids)), + DeploymentConditions.by_lifecycle_stages(target_statuses), + ], + ) + + timestamp_now = datetime.now(UTC).isoformat() + history_specs = [ + DeploymentHistoryCreatorSpec( + deployment_id=deployment.id, + phase=lifecycle_type.value, + result=SchedulingResult.SUCCESS, + message="Rolling update completed successfully", + from_status=from_status, + to_status=to_status, + sub_steps=[], + ) + for deployment in completed + ] + + await self._deployment_repository.update_endpoint_lifecycle_bulk_with_history( + [batch_updater], BulkCreator(specs=history_specs) + ) + + # Send notifications + for deployment in completed: + try: + event = self._build_lifecycle_notification_event( + deployment=deployment, + from_status=from_status, + to_status=to_status, + transition_result="success", + timestamp=timestamp_now, + ) + await self._event_producer.anycast_event(event) + except Exception as e: + log.warning("Failed to send lifecycle notification: {}", e) + def _build_lifecycle_notification_event( self, deployment: DeploymentInfo, @@ -386,6 +553,13 @@ def _create_task_specs() -> list[DeploymentTaskSpec]: long_interval=30.0, initial_delay=10.0, ), + # Deploying (rolling update) - both short and long cycles + DeploymentTaskSpec( 
+ DeploymentLifecycleType.DEPLOYING, + short_interval=5.0, + long_interval=30.0, + initial_delay=10.0, + ), # Check destroying deployments - only long cycle DeploymentTaskSpec( DeploymentLifecycleType.DESTROYING, diff --git a/src/ai/backend/manager/sokovan/deployment/handlers/__init__.py b/src/ai/backend/manager/sokovan/deployment/handlers/__init__.py index 90f4cc62fe7..a5c94ed0ae4 100644 --- a/src/ai/backend/manager/sokovan/deployment/handlers/__init__.py +++ b/src/ai/backend/manager/sokovan/deployment/handlers/__init__.py @@ -3,6 +3,12 @@ """ from .base import DeploymentHandler +from .deploying import ( + DeployingInProgressHandler, + DeployingProgressingHandler, + DeployingProvisioningHandler, + DeployingRolledBackHandler, +) from .destroying import DestroyingDeploymentHandler from .pending import CheckPendingDeploymentHandler from .reconcile import ReconcileDeploymentHandler @@ -12,6 +18,10 @@ __all__ = [ "CheckPendingDeploymentHandler", "CheckReplicaDeploymentHandler", + "DeployingInProgressHandler", + "DeployingProgressingHandler", + "DeployingProvisioningHandler", + "DeployingRolledBackHandler", "DeploymentHandler", "DestroyingDeploymentHandler", "ReconcileDeploymentHandler", diff --git a/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py b/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py new file mode 100644 index 00000000000..93e60da0467 --- /dev/null +++ b/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py @@ -0,0 +1,189 @@ +"""Handlers for DEPLOYING sub-steps (BEP-1049). + +In-progress handlers (PROVISIONING, PROGRESSING) run *after* the strategy +evaluator has already applied route mutations. Their ``execute`` simply +returns success. ``post_process`` triggers the next DEPLOYING cycle and +route provisioning. + +The rolled-back handler clears ``deploying_revision`` and transitions the +deployment back to READY. 
+""" + +from __future__ import annotations + +import logging +from collections.abc import Sequence +from typing import override + +from ai.backend.logging import BraceStyleAdapter +from ai.backend.manager.data.deployment.types import ( + DeploymentInfo, + DeploymentLifecycleStatus, + DeploymentStatusTransitions, + DeploymentSubStep, +) +from ai.backend.manager.data.model_serving.types import EndpointLifecycle +from ai.backend.manager.defs import LockID +from ai.backend.manager.repositories.deployment.repository import DeploymentRepository +from ai.backend.manager.sokovan.deployment.deployment_controller import DeploymentController +from ai.backend.manager.sokovan.deployment.route.route_controller import RouteController +from ai.backend.manager.sokovan.deployment.route.types import RouteLifecycleType +from ai.backend.manager.sokovan.deployment.types import ( + DeploymentExecutionResult, + DeploymentLifecycleType, +) + +from .base import DeploymentHandler + +log = BraceStyleAdapter(logging.getLogger(__name__)) + + +# --------------------------------------------------------------------------- +# In-progress handlers (PROVISIONING / PROGRESSING) +# --------------------------------------------------------------------------- + + +class DeployingInProgressHandler(DeploymentHandler): + """Base handler for in-progress DEPLOYING sub-steps. + + execute() returns success for all supplied deployments. + post_process() re-schedules the DEPLOYING cycle and triggers route provisioning. 
+ """ + + def __init__( + self, + deployment_controller: DeploymentController, + route_controller: RouteController, + ) -> None: + self._deployment_controller = deployment_controller + self._route_controller = route_controller + + @classmethod + @override + def name(cls) -> str: + return "deploying-in-progress" + + @property + @override + def lock_id(self) -> LockID | None: + return None # Lock is managed by the coordinator's _process_with_evaluator + + @classmethod + @override + def target_statuses(cls) -> list[EndpointLifecycle]: + return [EndpointLifecycle.DEPLOYING] + + @classmethod + @override + def status_transitions(cls) -> DeploymentStatusTransitions: + # Stay in DEPLOYING — no automatic transition here. + return DeploymentStatusTransitions(success=None, failure=None) + + @override + async def execute(self, deployments: Sequence[DeploymentInfo]) -> DeploymentExecutionResult: + return DeploymentExecutionResult(successes=list(deployments)) + + @override + async def post_process(self, result: DeploymentExecutionResult) -> None: + # Re-schedule DEPLOYING for the next coordinator cycle + await self._deployment_controller.mark_lifecycle_needed(DeploymentLifecycleType.DEPLOYING) + # Trigger route provisioning so new routes get sessions + await self._route_controller.mark_lifecycle_needed(RouteLifecycleType.PROVISIONING) + + +class DeployingProvisioningHandler(DeployingInProgressHandler): + """Handler for DEPLOYING / PROVISIONING sub-step. + + New-revision routes are being created; waiting for them to become HEALTHY. 
+ """ + + @classmethod + @override + def name(cls) -> str: + return "deploying-provisioning" + + @classmethod + @override + def status_transitions(cls) -> DeploymentStatusTransitions: + return DeploymentStatusTransitions( + success=DeploymentLifecycleStatus( + lifecycle=EndpointLifecycle.DEPLOYING, + sub_status=DeploymentSubStep.PROVISIONING, + ), + failure=None, + ) + + +class DeployingProgressingHandler(DeployingInProgressHandler): + """Handler for DEPLOYING / PROGRESSING sub-step. + + Actively replacing old routes with new routes. + """ + + @classmethod + @override + def name(cls) -> str: + return "deploying-progressing" + + @classmethod + @override + def status_transitions(cls) -> DeploymentStatusTransitions: + return DeploymentStatusTransitions( + success=DeploymentLifecycleStatus( + lifecycle=EndpointLifecycle.DEPLOYING, + sub_status=DeploymentSubStep.PROGRESSING, + ), + failure=None, + ) + + +# --------------------------------------------------------------------------- +# Rolled-back handler +# --------------------------------------------------------------------------- + + +class DeployingRolledBackHandler(DeploymentHandler): + """Handler for DEPLOYING / ROLLED_BACK sub-step. + + Clears ``deploying_revision`` and transitions to READY / ROLLED_BACK. 
+ """ + + def __init__(self, deployment_repo: DeploymentRepository) -> None: + self._deployment_repo = deployment_repo + + @classmethod + @override + def name(cls) -> str: + return "deploying-rolled-back" + + @property + @override + def lock_id(self) -> LockID | None: + return None # Lock is managed by the coordinator + + @classmethod + @override + def target_statuses(cls) -> list[EndpointLifecycle]: + return [EndpointLifecycle.DEPLOYING] + + @classmethod + @override + def status_transitions(cls) -> DeploymentStatusTransitions: + return DeploymentStatusTransitions( + success=DeploymentLifecycleStatus( + lifecycle=EndpointLifecycle.READY, + sub_status=DeploymentSubStep.ROLLED_BACK, + ), + failure=None, + ) + + @override + async def execute(self, deployments: Sequence[DeploymentInfo]) -> DeploymentExecutionResult: + endpoint_ids = {d.id for d in deployments} + await self._deployment_repo.clear_deploying_revision(endpoint_ids) + log.info("Cleared deploying_revision for {} rolled-back deployments", len(endpoint_ids)) + return DeploymentExecutionResult(successes=list(deployments)) + + @override + async def post_process(self, result: DeploymentExecutionResult) -> None: + pass diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/__init__.py b/src/ai/backend/manager/sokovan/deployment/strategy/__init__.py new file mode 100644 index 00000000000..a2ecf59ecb4 --- /dev/null +++ b/src/ai/backend/manager/sokovan/deployment/strategy/__init__.py @@ -0,0 +1 @@ +"""Deployment strategy evaluation for rolling update and blue-green deployments (BEP-1049).""" diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py new file mode 100644 index 00000000000..c5a916b6968 --- /dev/null +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -0,0 +1,150 @@ +"""Deployment strategy evaluator — orchestrates per-deployment FSM evaluation (BEP-1049). 
+ +Loads policies and routes in bulk, dispatches each deployment to the appropriate +strategy FSM, aggregates route mutations, and applies them in one batch. +""" + +from __future__ import annotations + +import logging +from collections.abc import Sequence +from uuid import UUID + +from ai.backend.common.data.model_deployment.types import DeploymentStrategy +from ai.backend.logging import BraceStyleAdapter +from ai.backend.manager.data.deployment.types import ( + DeploymentInfo, + DeploymentPolicyData, + RouteInfo, + RouteStatus, + RouteTrafficStatus, +) +from ai.backend.manager.models.deployment_policy import RollingUpdateSpec +from ai.backend.manager.models.routing import RoutingRow +from ai.backend.manager.repositories.base import Creator +from ai.backend.manager.repositories.base.updater import BatchUpdater +from ai.backend.manager.repositories.deployment.creators import RouteBatchUpdaterSpec +from ai.backend.manager.repositories.deployment.options import RouteConditions +from ai.backend.manager.repositories.deployment.repository import DeploymentRepository + +from .rolling_update import rolling_update_evaluate +from .types import CycleEvaluationResult, EvaluationGroup, EvaluationResult + +log = BraceStyleAdapter(logging.getLogger(__name__)) + + +class DeploymentStrategyEvaluator: + """Evaluates DEPLOYING deployments and produces grouped results + route mutations.""" + + def __init__(self, deployment_repo: DeploymentRepository) -> None: + self._deployment_repo = deployment_repo + + async def evaluate( + self, + deployments: Sequence[DeploymentInfo], + ) -> EvaluationResult: + """Evaluate all DEPLOYING deployments in a single cycle. + + Steps: + 1. Bulk-load policies and active routes. + 2. Per-deployment: dispatch to strategy FSM. + 3. Aggregate route changes and apply in one batch. + 4. Group deployments by sub-step and return. + """ + result = EvaluationResult() + + if not deployments: + return result + + endpoint_ids = {d.id for d in deployments} + + # ── 1. 
Bulk-load policies and routes ── + policy_map = await self._deployment_repo.fetch_deployment_policies_by_endpoint_ids( + endpoint_ids + ) + route_map = await self._deployment_repo.fetch_active_routes_by_endpoint_ids(endpoint_ids) + + # ── 2. Per-deployment evaluation ── + all_scale_out: list[Creator[RoutingRow]] = [] + all_scale_in_ids: list[UUID] = [] + + for deployment in deployments: + policy = policy_map.get(deployment.id) + if policy is None: + log.warning("deployment {}: no policy found — skipping", deployment.id) + result.skipped.append(deployment) + continue + + routes: list[RouteInfo] = list(route_map.get(deployment.id, [])) + + try: + cycle_result = self._evaluate_single(deployment, routes, policy.strategy, policy) + except Exception as e: + log.warning("deployment {}: evaluation error — {}", deployment.id, e) + result.errors.append((deployment, str(e))) + continue + + # Collect route changes + changes = cycle_result.route_changes + all_scale_out.extend(changes.scale_out_specs) + all_scale_in_ids.extend(changes.scale_in_route_ids) + + # Group by sub-step + if cycle_result.completed: + result.completed.append(deployment) + else: + group = result.groups.setdefault( + cycle_result.sub_step, + EvaluationGroup(sub_step=cycle_result.sub_step), + ) + group.deployments.append(deployment) + + # ── 3. 
Apply route mutations in batch ── + await self._apply_route_changes(all_scale_out, all_scale_in_ids) + + return result + + def _evaluate_single( + self, + deployment: DeploymentInfo, + routes: list[RouteInfo], + strategy: DeploymentStrategy, + policy: DeploymentPolicyData, + ) -> CycleEvaluationResult: + """Dispatch to the appropriate strategy FSM.""" + if strategy == DeploymentStrategy.ROLLING: + spec = policy.strategy_spec + if not isinstance(spec, RollingUpdateSpec): + raise ValueError( + f"Expected RollingUpdateSpec for ROLLING strategy, got {type(spec).__name__}" + ) + return rolling_update_evaluate(deployment, routes, spec) + + raise ValueError(f"Unsupported deployment strategy: {strategy}") + + async def _apply_route_changes( + self, + scale_out: list[Creator[RoutingRow]], + scale_in_ids: list[UUID], + ) -> None: + """Apply aggregated route mutations in a single DB transaction.""" + if not scale_out and not scale_in_ids: + return + + scale_in_updater: BatchUpdater[RoutingRow] | None = None + if scale_in_ids: + scale_in_updater = BatchUpdater( + spec=RouteBatchUpdaterSpec( + status=RouteStatus.TERMINATING, + traffic_ratio=0.0, + traffic_status=RouteTrafficStatus.INACTIVE, + ), + conditions=[RouteConditions.by_ids(scale_in_ids)], + ) + + await self._deployment_repo.scale_routes(scale_out, scale_in_updater) + log.debug( + "Applied route changes: {} created, {} terminated", + len(scale_out), + len(scale_in_ids), + ) diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py b/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py new file mode 100644 index 00000000000..d64ea24e980 --- /dev/null +++ b/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py @@ -0,0 +1,164 @@ +"""Rolling update FSM evaluation for a single deployment cycle (BEP-1049). + +Classifies routes by revision (old/new) and status, then decides the next +sub-step and route mutations based on ``max_surge`` / ``max_unavailable``. 
+""" + +from __future__ import annotations + +import logging +from collections.abc import Sequence + +from ai.backend.logging import BraceStyleAdapter +from ai.backend.manager.data.deployment.types import ( + DeploymentInfo, + DeploymentSubStep, + RouteInfo, + RouteStatus, +) +from ai.backend.manager.models.deployment_policy import RollingUpdateSpec +from ai.backend.manager.models.routing import RoutingRow +from ai.backend.manager.repositories.base import Creator +from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec + +from .types import CycleEvaluationResult, RouteChanges + +log = BraceStyleAdapter(logging.getLogger(__name__)) + + +def rolling_update_evaluate( + deployment: DeploymentInfo, + routes: Sequence[RouteInfo], + spec: RollingUpdateSpec, +) -> CycleEvaluationResult: + """Evaluate one cycle of rolling update for a single deployment. + + FSM flow: + 1. Classify routes into old / new by revision_id. + 2. If any new route is PROVISIONING → PROVISIONING (wait). + 3. If no old routes remain and new_healthy >= desired → completed. + 4. If all new routes failed → ROLLED_BACK. + 5. Compute allowed surge/unavailable, decide create/terminate → PROGRESSING. + """ + deploying_rev = deployment.deploying_revision_id + desired = deployment.replica_spec.target_replica_count + + # ── 1. Classify routes ── + old_active: list[RouteInfo] = [] + new_provisioning: list[RouteInfo] = [] + new_healthy: list[RouteInfo] = [] + new_failed: list[RouteInfo] = [] + + for r in routes: + is_new = r.revision_id == deploying_rev + if not is_new: + if r.status.is_active(): + old_active.append(r) + continue + + if r.status == RouteStatus.PROVISIONING: + new_provisioning.append(r) + elif r.status == RouteStatus.HEALTHY: + new_healthy.append(r) + elif r.status in (RouteStatus.FAILED_TO_START, RouteStatus.TERMINATED): + new_failed.append(r) + elif r.status.is_active(): + new_healthy.append(r) + + total_new_live = len(new_provisioning) + len(new_healthy) + + # ── 2. 
PROVISIONING: wait for in-flight routes ── + if new_provisioning: + log.debug( + "deployment {}: {} new routes still provisioning", + deployment.id, + len(new_provisioning), + ) + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROVISIONING) + + # ── 3. Completed: all old replaced, enough new healthy ── + if not old_active and len(new_healthy) >= desired: + log.info( + "deployment {}: rolling update complete ({} healthy routes)", + deployment.id, + len(new_healthy), + ) + return CycleEvaluationResult( + sub_step=DeploymentSubStep.PROGRESSING, + completed=True, + ) + + # ── 4. Rolled back: every new route failed ── + if total_new_live == 0 and new_failed: + log.warning( + "deployment {}: all {} new routes failed — rolling back", + deployment.id, + len(new_failed), + ) + return CycleEvaluationResult(sub_step=DeploymentSubStep.ROLLED_BACK) + + # ── 5. PROGRESSING: compute surge / unavailable budget ── + max_surge = spec.max_surge + max_unavailable = spec.max_unavailable + + # Total pods allowed at peak = desired + max_surge + max_total = desired + max_surge + current_total = len(old_active) + total_new_live + + # Minimum available pods = desired - max_unavailable + min_available = max(0, desired - max_unavailable) + + route_changes = RouteChanges() + + # Decide how many new routes to create + can_create = max_total - current_total + still_needed = desired - total_new_live + to_create = max(0, min(can_create, still_needed)) + + if to_create > 0: + route_changes.scale_out_specs = _build_route_creators(deployment, to_create) + + # Decide how many old routes to terminate + available_count = len(new_healthy) + len(old_active) + can_terminate = available_count - min_available + to_terminate = max(0, min(can_terminate, len(old_active))) + + if to_terminate > 0: + # Terminate old routes with lowest termination priority first + sorted_old = sorted(old_active, key=lambda r: r.status.termination_priority()) + for r in sorted_old[:to_terminate]: + 
route_changes.scale_in_route_ids.append(r.route_id) + + log.debug( + "deployment {}: PROGRESSING create={}, terminate={}, " + "old_active={}, new_healthy={}, new_prov={}", + deployment.id, + to_create, + to_terminate, + len(old_active), + len(new_healthy), + len(new_provisioning), + ) + + return CycleEvaluationResult( + sub_step=DeploymentSubStep.PROGRESSING, + route_changes=route_changes, + ) + + +def _build_route_creators( + deployment: DeploymentInfo, + count: int, +) -> list[Creator[RoutingRow]]: + """Build route creator specs for new revision routes.""" + creators: list[Creator[RoutingRow]] = [] + for _ in range(count): + spec = RouteCreatorSpec( + endpoint_id=deployment.id, + session_owner_id=deployment.metadata.session_owner, + domain=deployment.metadata.domain, + project_id=deployment.metadata.project, + revision_id=deployment.deploying_revision_id, + ) + creators.append(Creator(spec=spec)) + return creators diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/types.py b/src/ai/backend/manager/sokovan/deployment/strategy/types.py new file mode 100644 index 00000000000..dd8c61c13e5 --- /dev/null +++ b/src/ai/backend/manager/sokovan/deployment/strategy/types.py @@ -0,0 +1,48 @@ +"""Types for deployment strategy evaluation (BEP-1049).""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from uuid import UUID + +from ai.backend.manager.data.deployment.types import ( + DeploymentInfo, + DeploymentSubStep, +) +from ai.backend.manager.models.routing import RoutingRow +from ai.backend.manager.repositories.base import Creator + + +@dataclass +class RouteChanges: + """Route mutations to apply for a single deployment cycle.""" + + scale_out_specs: list[Creator[RoutingRow]] = field(default_factory=list) + scale_in_route_ids: list[UUID] = field(default_factory=list) + + +@dataclass +class CycleEvaluationResult: + """Result of evaluating a single deployment's rolling update cycle.""" + + sub_step: DeploymentSubStep + completed: 
bool = False + route_changes: RouteChanges = field(default_factory=RouteChanges) + + +@dataclass +class EvaluationGroup: + """Deployments grouped by their sub-step result.""" + + sub_step: DeploymentSubStep + deployments: list[DeploymentInfo] = field(default_factory=list) + + +@dataclass +class EvaluationResult: + """Aggregate result of evaluating all DEPLOYING deployments.""" + + groups: dict[DeploymentSubStep, EvaluationGroup] = field(default_factory=dict) + completed: list[DeploymentInfo] = field(default_factory=list) + skipped: list[DeploymentInfo] = field(default_factory=list) + errors: list[tuple[DeploymentInfo, str]] = field(default_factory=list) diff --git a/src/ai/backend/manager/sokovan/deployment/types.py b/src/ai/backend/manager/sokovan/deployment/types.py index 508534fc850..bf6e6dd2744 100644 --- a/src/ai/backend/manager/sokovan/deployment/types.py +++ b/src/ai/backend/manager/sokovan/deployment/types.py @@ -16,6 +16,7 @@ class DeploymentLifecycleType(StrEnum): CHECK_REPLICA = "check_replica" SCALING = "scaling" RECONCILE = "reconcile" + DEPLOYING = "deploying" DESTROYING = "destroying" @@ -34,6 +35,7 @@ class DeploymentExecutionResult: successes: list[DeploymentInfo] = field(default_factory=list) errors: list[DeploymentExecutionError] = field(default_factory=list) skipped: list[DeploymentInfo] = field(default_factory=list) + completed: list[DeploymentInfo] = field(default_factory=list) @dataclass From 21ac94970f8367db6666437768348c731d9d82f1 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 00:04:56 +0000 Subject: [PATCH 02/23] wip --- .../sokovan/deployment/strategy/blue_green.py | 26 ++++ .../sokovan/deployment/strategy/evaluator.py | 11 +- .../deployment/strategy/rolling_update.py | 146 +----------------- 3 files changed, 40 insertions(+), 143 deletions(-) create mode 100644 src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py 
b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py new file mode 100644 index 00000000000..6e76625ed8b --- /dev/null +++ b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py @@ -0,0 +1,26 @@ +"""Blue-green deployment strategy evaluation for a single deployment cycle (BEP-1049). + +Provisions a full set of new-revision routes, validates them, then atomically +switches traffic from the old revision to the new one. +""" + +from __future__ import annotations + +from collections.abc import Sequence + +from ai.backend.manager.data.deployment.types import ( + DeploymentInfo, + RouteInfo, +) +from ai.backend.manager.models.deployment_policy import BlueGreenSpec + +from .types import CycleEvaluationResult + + +def blue_green_evaluate( + deployment: DeploymentInfo, + routes: Sequence[RouteInfo], + spec: BlueGreenSpec, +) -> CycleEvaluationResult: + """Evaluate one cycle of blue-green deployment for a single deployment.""" + raise NotImplementedError("Blue-green deployment strategy is not yet implemented") diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index c5a916b6968..69fe7f4a957 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -19,7 +19,7 @@ RouteStatus, RouteTrafficStatus, ) -from ai.backend.manager.models.deployment_policy import RollingUpdateSpec +from ai.backend.manager.models.deployment_policy import BlueGreenSpec, RollingUpdateSpec from ai.backend.manager.models.routing import RoutingRow from ai.backend.manager.repositories.base import Creator from ai.backend.manager.repositories.base.updater import BatchUpdater @@ -27,6 +27,7 @@ from ai.backend.manager.repositories.deployment.options import RouteConditions from ai.backend.manager.repositories.deployment.repository import DeploymentRepository +from .blue_green import blue_green_evaluate from 
.rolling_update import rolling_update_evaluate from .types import CycleEvaluationResult, EvaluationGroup, EvaluationResult @@ -120,6 +121,14 @@ def _evaluate_single( ) return rolling_update_evaluate(deployment, routes, spec) + if strategy == DeploymentStrategy.BLUE_GREEN: + spec = policy.strategy_spec + if not isinstance(spec, BlueGreenSpec): + raise ValueError( + f"Expected BlueGreenSpec for BLUE_GREEN strategy, got {type(spec).__name__}" + ) + return blue_green_evaluate(deployment, routes, spec) + raise ValueError(f"Unsupported deployment strategy: {strategy}") async def _apply_route_changes( diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py b/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py index d64ea24e980..fbcb764355c 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py @@ -1,4 +1,4 @@ -"""Rolling update FSM evaluation for a single deployment cycle (BEP-1049). +"""Rolling update strategy evaluation for a single deployment cycle (BEP-1049). Classifies routes by revision (old/new) and status, then decides the next sub-step and route mutations based on ``max_surge`` / ``max_unavailable``. 
@@ -6,24 +6,15 @@ from __future__ import annotations -import logging from collections.abc import Sequence -from ai.backend.logging import BraceStyleAdapter from ai.backend.manager.data.deployment.types import ( DeploymentInfo, - DeploymentSubStep, RouteInfo, - RouteStatus, ) from ai.backend.manager.models.deployment_policy import RollingUpdateSpec -from ai.backend.manager.models.routing import RoutingRow -from ai.backend.manager.repositories.base import Creator -from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec -from .types import CycleEvaluationResult, RouteChanges - -log = BraceStyleAdapter(logging.getLogger(__name__)) +from .types import CycleEvaluationResult def rolling_update_evaluate( @@ -31,134 +22,5 @@ def rolling_update_evaluate( routes: Sequence[RouteInfo], spec: RollingUpdateSpec, ) -> CycleEvaluationResult: - """Evaluate one cycle of rolling update for a single deployment. - - FSM flow: - 1. Classify routes into old / new by revision_id. - 2. If any new route is PROVISIONING → PROVISIONING (wait). - 3. If no old routes remain and new_healthy >= desired → completed. - 4. If all new routes failed → ROLLED_BACK. - 5. Compute allowed surge/unavailable, decide create/terminate → PROGRESSING. - """ - deploying_rev = deployment.deploying_revision_id - desired = deployment.replica_spec.target_replica_count - - # ── 1. 
Classify routes ── - old_active: list[RouteInfo] = [] - new_provisioning: list[RouteInfo] = [] - new_healthy: list[RouteInfo] = [] - new_failed: list[RouteInfo] = [] - - for r in routes: - is_new = r.revision_id == deploying_rev - if not is_new: - if r.status.is_active(): - old_active.append(r) - continue - - if r.status == RouteStatus.PROVISIONING: - new_provisioning.append(r) - elif r.status == RouteStatus.HEALTHY: - new_healthy.append(r) - elif r.status in (RouteStatus.FAILED_TO_START, RouteStatus.TERMINATED): - new_failed.append(r) - elif r.status.is_active(): - new_healthy.append(r) - - total_new_live = len(new_provisioning) + len(new_healthy) - - # ── 2. PROVISIONING: wait for in-flight routes ── - if new_provisioning: - log.debug( - "deployment {}: {} new routes still provisioning", - deployment.id, - len(new_provisioning), - ) - return CycleEvaluationResult(sub_step=DeploymentSubStep.PROVISIONING) - - # ── 3. Completed: all old replaced, enough new healthy ── - if not old_active and len(new_healthy) >= desired: - log.info( - "deployment {}: rolling update complete ({} healthy routes)", - deployment.id, - len(new_healthy), - ) - return CycleEvaluationResult( - sub_step=DeploymentSubStep.PROGRESSING, - completed=True, - ) - - # ── 4. Rolled back: every new route failed ── - if total_new_live == 0 and new_failed: - log.warning( - "deployment {}: all {} new routes failed — rolling back", - deployment.id, - len(new_failed), - ) - return CycleEvaluationResult(sub_step=DeploymentSubStep.ROLLED_BACK) - - # ── 5. 
PROGRESSING: compute surge / unavailable budget ── - max_surge = spec.max_surge - max_unavailable = spec.max_unavailable - - # Total pods allowed at peak = desired + max_surge - max_total = desired + max_surge - current_total = len(old_active) + total_new_live - - # Minimum available pods = desired - max_unavailable - min_available = max(0, desired - max_unavailable) - - route_changes = RouteChanges() - - # Decide how many new routes to create - can_create = max_total - current_total - still_needed = desired - total_new_live - to_create = max(0, min(can_create, still_needed)) - - if to_create > 0: - route_changes.scale_out_specs = _build_route_creators(deployment, to_create) - - # Decide how many old routes to terminate - available_count = len(new_healthy) + len(old_active) - can_terminate = available_count - min_available - to_terminate = max(0, min(can_terminate, len(old_active))) - - if to_terminate > 0: - # Terminate old routes with lowest termination priority first - sorted_old = sorted(old_active, key=lambda r: r.status.termination_priority()) - for r in sorted_old[:to_terminate]: - route_changes.scale_in_route_ids.append(r.route_id) - - log.debug( - "deployment {}: PROGRESSING create={}, terminate={}, " - "old_active={}, new_healthy={}, new_prov={}", - deployment.id, - to_create, - to_terminate, - len(old_active), - len(new_healthy), - len(new_provisioning), - ) - - return CycleEvaluationResult( - sub_step=DeploymentSubStep.PROGRESSING, - route_changes=route_changes, - ) - - -def _build_route_creators( - deployment: DeploymentInfo, - count: int, -) -> list[Creator[RoutingRow]]: - """Build route creator specs for new revision routes.""" - creators: list[Creator[RoutingRow]] = [] - for _ in range(count): - spec = RouteCreatorSpec( - endpoint_id=deployment.id, - session_owner_id=deployment.metadata.session_owner, - domain=deployment.metadata.domain, - project_id=deployment.metadata.project, - revision_id=deployment.deploying_revision_id, - ) - 
creators.append(Creator(spec=spec)) - return creators + """Evaluate one cycle of rolling update for a single deployment.""" + raise NotImplementedError("Rolling update strategy is not yet implemented") From 234e1b890a213b8807e1663f94bbb8a24e01e279 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 00:07:38 +0000 Subject: [PATCH 03/23] docs: Add news fragment --- changes/9566.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/9566.feature.md diff --git a/changes/9566.feature.md b/changes/9566.feature.md new file mode 100644 index 00000000000..66ace8236c9 --- /dev/null +++ b/changes/9566.feature.md @@ -0,0 +1 @@ +Add the DEPLOYING lifecycle with strategy evaluator framework, sub-step handlers (PROVISIONING, PROGRESSING, ROLLED_BACK), and coordinator integration for BEP-1049. \ No newline at end of file From 3255c369d5cb5ad564067aa522c086baccf86cdc Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 04:03:29 +0000 Subject: [PATCH 04/23] feat: Implement Blue-Green deployment strategy (BEP-1049) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the blue-green FSM in blue_green_evaluate() with 8-step flow: classify routes → create green (INACTIVE) → wait provisioning → rollback if all failed → wait healthy < desired → manual wait → delay check → atomic promotion (green→ACTIVE + blue→TERMINATING). 
Key changes: - Add promote_route_ids to RouteChanges for green→ACTIVE promotion - Add status_updated_at to RoutingRow/RouteInfo for promote_delay_seconds - Add fetch_routes_by_endpoint_ids (no status filter) for rollback detection - Extend scale_routes with promote_updater parameter - Auto-set status_updated_at in RouteBatchUpdaterSpec on status change - Add alembic migration for status_updated_at column Co-Authored-By: Claude Opus 4.6 --- .../backend/manager/data/deployment/types.py | 1 + ...0e1f2_add_status_updated_at_to_routings.py | 34 ++ src/ai/backend/manager/models/routing/row.py | 7 + .../repositories/deployment/creators/route.py | 2 + .../deployment/db_source/db_source.py | 30 ++ .../repositories/deployment/repository.py | 11 +- .../sokovan/deployment/strategy/blue_green.py | 116 ++++- .../sokovan/deployment/strategy/evaluator.py | 24 +- .../sokovan/deployment/strategy/types.py | 1 + .../manager/sokovan/deployment/strategy/BUILD | 3 + .../sokovan/deployment/strategy/__init__.py | 0 .../deployment/strategy/test_blue_green.py | 491 ++++++++++++++++++ 12 files changed, 711 insertions(+), 9 deletions(-) create mode 100644 src/ai/backend/manager/models/alembic/versions/a7b8c9d0e1f2_add_status_updated_at_to_routings.py create mode 100644 tests/unit/manager/sokovan/deployment/strategy/BUILD create mode 100644 tests/unit/manager/sokovan/deployment/strategy/__init__.py create mode 100644 tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py diff --git a/src/ai/backend/manager/data/deployment/types.py b/src/ai/backend/manager/data/deployment/types.py index ccee6a29ed1..28a3ff8e050 100644 --- a/src/ai/backend/manager/data/deployment/types.py +++ b/src/ai/backend/manager/data/deployment/types.py @@ -405,6 +405,7 @@ class RouteInfo: created_at: datetime | None revision_id: UUID | None traffic_status: RouteTrafficStatus + status_updated_at: datetime | None = None error_data: dict[str, Any] = field(default_factory=dict) diff --git 
a/src/ai/backend/manager/models/alembic/versions/a7b8c9d0e1f2_add_status_updated_at_to_routings.py b/src/ai/backend/manager/models/alembic/versions/a7b8c9d0e1f2_add_status_updated_at_to_routings.py new file mode 100644 index 00000000000..b85b941ba00 --- /dev/null +++ b/src/ai/backend/manager/models/alembic/versions/a7b8c9d0e1f2_add_status_updated_at_to_routings.py @@ -0,0 +1,34 @@ +"""add status_updated_at to routings + +Revision ID: a7b8c9d0e1f2 +Revises: 32ad43817452 +Create Date: 2026-03-02 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "a7b8c9d0e1f2" +down_revision = "32ad43817452" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "routings", + sa.Column( + "status_updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=True, + ), + ) + # Backfill existing rows with created_at value + op.execute("UPDATE routings SET status_updated_at = COALESCE(created_at, now())") + + +def downgrade() -> None: + op.drop_column("routings", "status_updated_at") diff --git a/src/ai/backend/manager/models/routing/row.py b/src/ai/backend/manager/models/routing/row.py index 51a9d9c1f9f..37ac464c8ce 100644 --- a/src/ai/backend/manager/models/routing/row.py +++ b/src/ai/backend/manager/models/routing/row.py @@ -95,6 +95,12 @@ class RoutingRow(Base): # type: ignore[misc] # Revision reference without FK (relationship only) revision: Mapped[uuid.UUID | None] = mapped_column("revision", GUID, nullable=True) + status_updated_at: Mapped[datetime | None] = mapped_column( + "status_updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=True, + ) traffic_status: Mapped[RouteTrafficStatus] = mapped_column( "traffic_status", EnumValueType(RouteTrafficStatus), @@ -255,5 +261,6 @@ def to_route_info(self) -> RouteInfo: created_at=self.created_at, revision_id=self.revision, traffic_status=self.traffic_status, + 
status_updated_at=self.status_updated_at, error_data=self.error_data or {}, ) diff --git a/src/ai/backend/manager/repositories/deployment/creators/route.py b/src/ai/backend/manager/repositories/deployment/creators/route.py index 2b313d7c172..254b6c087f4 100644 --- a/src/ai/backend/manager/repositories/deployment/creators/route.py +++ b/src/ai/backend/manager/repositories/deployment/creators/route.py @@ -4,6 +4,7 @@ import uuid from dataclasses import dataclass +from datetime import UTC, datetime from typing import Any, override from ai.backend.manager.data.deployment.types import RouteStatus, RouteTrafficStatus @@ -66,6 +67,7 @@ def build_values(self) -> dict[str, Any]: values: dict[str, Any] = {} if self.status is not None: values["status"] = self.status + values["status_updated_at"] = datetime.now(UTC) if self.traffic_ratio is not None: values["traffic_ratio"] = self.traffic_ratio if self.traffic_status is not None: diff --git a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py index ac953e35c20..050e712f7e2 100644 --- a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py +++ b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py @@ -1406,10 +1406,37 @@ async def fetch_active_routes_by_endpoint_ids( routes_by_endpoint[row.endpoint].append(row.to_route_info()) return routes_by_endpoint + async def fetch_routes_by_endpoint_ids( + self, + endpoint_ids: set[uuid.UUID], + ) -> Mapping[uuid.UUID, list[RouteInfo]]: + """Fetch all routes for given endpoint IDs (no status filter). + + Unlike fetch_active_routes_by_endpoint_ids, this includes routes + in all statuses (FAILED_TO_START, TERMINATED, etc.), which is + required for blue-green rollback detection. 
+ """ + if not endpoint_ids: + return {} + + async with self._begin_readonly_session_read_committed() as db_sess: + query = sa.select(RoutingRow).where( + RoutingRow.endpoint.in_(endpoint_ids), + ) + result = await db_sess.execute(query) + rows: Sequence[RoutingRow] = result.scalars().all() + routes_by_endpoint: defaultdict[uuid.UUID, list[RouteInfo]] = defaultdict(list) + for row in rows: + if row.endpoint not in routes_by_endpoint: + routes_by_endpoint[row.endpoint] = [] + routes_by_endpoint[row.endpoint].append(row.to_route_info()) + return routes_by_endpoint + async def scale_routes( self, scale_out_creators: Sequence[Creator[RoutingRow]], scale_in_updater: BatchUpdater[RoutingRow] | None, + promote_updater: BatchUpdater[RoutingRow] | None = None, ) -> None: """Scale out/in routes based on provided creators and updater.""" async with self._begin_session_read_committed() as db_sess: @@ -1419,6 +1446,9 @@ async def scale_routes( # Scale in routes if scale_in_updater: await execute_batch_updater(db_sess, scale_in_updater) + # Promote routes (blue-green) + if promote_updater: + await execute_batch_updater(db_sess, promote_updater) # Route operations diff --git a/src/ai/backend/manager/repositories/deployment/repository.py b/src/ai/backend/manager/repositories/deployment/repository.py index 20a6c9df4c9..4b28ef2596d 100644 --- a/src/ai/backend/manager/repositories/deployment/repository.py +++ b/src/ai/backend/manager/repositories/deployment/repository.py @@ -548,13 +548,22 @@ async def fetch_active_routes_by_endpoint_ids( """Fetch routes for multiple endpoints.""" return await self._db_source.fetch_active_routes_by_endpoint_ids(endpoint_ids) + @deployment_repository_resilience.apply() + async def fetch_routes_by_endpoint_ids( + self, + endpoint_ids: set[uuid.UUID], + ) -> Mapping[uuid.UUID, list[RouteInfo]]: + """Fetch all routes for multiple endpoints (no status filter).""" + return await self._db_source.fetch_routes_by_endpoint_ids(endpoint_ids) + 
@deployment_repository_resilience.apply() async def scale_routes( self, scale_out_creators: Sequence[Creator[RoutingRow]], scale_in_updater: BatchUpdater[RoutingRow] | None, + promote_updater: BatchUpdater[RoutingRow] | None = None, ) -> None: - await self._db_source.scale_routes(scale_out_creators, scale_in_updater) + await self._db_source.scale_routes(scale_out_creators, scale_in_updater, promote_updater) # Route operations diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py index 6e76625ed8b..37f071abd32 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py @@ -7,14 +7,21 @@ from __future__ import annotations from collections.abc import Sequence +from datetime import UTC, datetime from ai.backend.manager.data.deployment.types import ( DeploymentInfo, + DeploymentSubStep, RouteInfo, + RouteStatus, + RouteTrafficStatus, ) from ai.backend.manager.models.deployment_policy import BlueGreenSpec +from ai.backend.manager.models.routing import RoutingRow +from ai.backend.manager.repositories.base import Creator +from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec -from .types import CycleEvaluationResult +from .types import CycleEvaluationResult, RouteChanges def blue_green_evaluate( @@ -22,5 +29,108 @@ def blue_green_evaluate( routes: Sequence[RouteInfo], spec: BlueGreenSpec, ) -> CycleEvaluationResult: - """Evaluate one cycle of blue-green deployment for a single deployment.""" - raise NotImplementedError("Blue-green deployment strategy is not yet implemented") + """Evaluate one cycle of blue-green deployment for a single deployment. + + FSM Steps: + 1. Classify routes into blue (old revision) and green (new/deploying revision). + 2. If no green routes exist, create ``desired`` green routes (INACTIVE). + 3. If any green is PROVISIONING, wait. + 4. 
If all green routes FAILED, rollback. + 5. If healthy green < desired, wait. + 6. All green healthy + auto_promote=False → wait for manual promotion. + 7. All green healthy + auto_promote=True + delay not elapsed → wait. + 8. All green healthy + auto_promote=True + delay elapsed → promote. + """ + deploying_revision = deployment.deploying_revision_id + desired = deployment.replica_spec.target_replica_count + + # ── Step 1: Classify routes ── + green: list[RouteInfo] = [] + blue: list[RouteInfo] = [] + for r in routes: + if r.revision_id is not None and r.revision_id == deploying_revision: + green.append(r) + else: + blue.append(r) + + green_provisioning = [r for r in green if r.status == RouteStatus.PROVISIONING] + green_healthy = [r for r in green if r.status == RouteStatus.HEALTHY] + green_failed = [ + r for r in green if r.status in (RouteStatus.FAILED_TO_START, RouteStatus.TERMINATED) + ] + blue_active = [r for r in blue if r.status.is_active()] + + # ── Step 2: No green routes → create them (INACTIVE, ratio=0.0) ── + if not green: + creators = _build_route_creators(deployment, desired) + return CycleEvaluationResult( + sub_step=DeploymentSubStep.PROVISIONING, + route_changes=RouteChanges(scale_out_specs=creators), + ) + + # ── Step 3: Green PROVISIONING exists → wait ── + if green_provisioning: + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROVISIONING) + + # ── Step 4: All green failed → rollback ── + if green_failed and not green_healthy: + return CycleEvaluationResult( + sub_step=DeploymentSubStep.ROLLED_BACK, + route_changes=RouteChanges( + scale_in_route_ids=[r.route_id for r in green_failed], + ), + ) + + # ── Step 5: Healthy green < desired → wait (progressing) ── + if len(green_healthy) < desired: + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) + + # ── Step 6: All green healthy + auto_promote=False → manual wait ── + if not spec.auto_promote: + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) + + 
# ── Step 7: auto_promote=True + delay check ── + if spec.promote_delay_seconds > 0: + latest_healthy_at = _latest_status_updated_at(green_healthy) + if latest_healthy_at is None: + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) + elapsed = (datetime.now(UTC) - latest_healthy_at).total_seconds() + if elapsed < spec.promote_delay_seconds: + return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) + + # ── Step 8: Promote green, terminate blue ── + return CycleEvaluationResult( + sub_step=DeploymentSubStep.PROGRESSING, + completed=True, + route_changes=RouteChanges( + promote_route_ids=[r.route_id for r in green_healthy], + scale_in_route_ids=[r.route_id for r in blue_active], + ), + ) + + +def _latest_status_updated_at(routes: list[RouteInfo]) -> datetime | None: + """Return the most recent status_updated_at among the given routes.""" + timestamps = [r.status_updated_at for r in routes if r.status_updated_at is not None] + return max(timestamps) if timestamps else None + + +def _build_route_creators( + deployment: DeploymentInfo, + count: int, +) -> list[Creator[RoutingRow]]: + """Build route creators for green routes (INACTIVE, traffic_ratio=0.0).""" + return [ + Creator( + spec=RouteCreatorSpec( + endpoint_id=deployment.id, + session_owner_id=deployment.metadata.session_owner, + domain=deployment.metadata.domain, + project_id=deployment.metadata.project, + traffic_ratio=0.0, + revision_id=deployment.deploying_revision_id, + traffic_status=RouteTrafficStatus.INACTIVE, + ) + ) + for _ in range(count) + ] diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index 69fe7f4a957..e10c8463f8f 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -63,11 +63,12 @@ async def evaluate( policy_map = await 
self._deployment_repo.fetch_deployment_policies_by_endpoint_ids( endpoint_ids ) - route_map = await self._deployment_repo.fetch_active_routes_by_endpoint_ids(endpoint_ids) + route_map = await self._deployment_repo.fetch_routes_by_endpoint_ids(endpoint_ids) # ── 2. Per-deployment evaluation ── all_scale_out: list[Creator[RoutingRow]] = [] all_scale_in_ids: list[UUID] = [] + all_promote_ids: list[UUID] = [] for deployment in deployments: policy = policy_map.get(deployment.id) @@ -89,6 +90,7 @@ async def evaluate( changes = cycle_result.route_changes all_scale_out.extend(changes.scale_out_specs) all_scale_in_ids.extend(changes.scale_in_route_ids) + all_promote_ids.extend(changes.promote_route_ids) # Group by sub-step if cycle_result.completed: @@ -101,7 +103,7 @@ async def evaluate( group.deployments.append(deployment) # ── 3. Apply route mutations in batch ── - await self._apply_route_changes(all_scale_out, all_scale_in_ids) + await self._apply_route_changes(all_scale_out, all_scale_in_ids, all_promote_ids) return result @@ -135,9 +137,10 @@ async def _apply_route_changes( self, scale_out: list[Creator[RoutingRow]], scale_in_ids: list[UUID], + promote_ids: list[UUID] | None = None, ) -> None: """Apply aggregated route mutations in a single DB transaction.""" - if not scale_out and not scale_in_ids: + if not scale_out and not scale_in_ids and not promote_ids: return scale_in_updater: BatchUpdater[RoutingRow] | None = None @@ -151,9 +154,20 @@ async def _apply_route_changes( conditions=[RouteConditions.by_ids(scale_in_ids)], ) - await self._deployment_repo.scale_routes(scale_out, scale_in_updater) + promote_updater: BatchUpdater[RoutingRow] | None = None + if promote_ids: + promote_updater = BatchUpdater( + spec=RouteBatchUpdaterSpec( + traffic_status=RouteTrafficStatus.ACTIVE, + traffic_ratio=1.0, + ), + conditions=[RouteConditions.by_ids(promote_ids)], + ) + + await self._deployment_repo.scale_routes(scale_out, scale_in_updater, promote_updater) log.debug( - "Applied 
route changes: {} created, {} terminated", + "Applied route changes: {} created, {} terminated, {} promoted", len(scale_out), len(scale_in_ids), + len(promote_ids) if promote_ids else 0, ) diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/types.py b/src/ai/backend/manager/sokovan/deployment/strategy/types.py index dd8c61c13e5..cb69cbe2f9e 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/types.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/types.py @@ -19,6 +19,7 @@ class RouteChanges: scale_out_specs: list[Creator[RoutingRow]] = field(default_factory=list) scale_in_route_ids: list[UUID] = field(default_factory=list) + promote_route_ids: list[UUID] = field(default_factory=list) @dataclass diff --git a/tests/unit/manager/sokovan/deployment/strategy/BUILD b/tests/unit/manager/sokovan/deployment/strategy/BUILD new file mode 100644 index 00000000000..57341b1358b --- /dev/null +++ b/tests/unit/manager/sokovan/deployment/strategy/BUILD @@ -0,0 +1,3 @@ +python_tests( + name="tests", +) diff --git a/tests/unit/manager/sokovan/deployment/strategy/__init__.py b/tests/unit/manager/sokovan/deployment/strategy/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py b/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py new file mode 100644 index 00000000000..cd3be817bf4 --- /dev/null +++ b/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py @@ -0,0 +1,491 @@ +"""Unit tests for blue-green deployment strategy evaluation (BEP-1049).""" + +from __future__ import annotations + +import uuid +from datetime import UTC, datetime, timedelta + +from ai.backend.manager.data.deployment.types import ( + DeploymentInfo, + DeploymentMetadata, + DeploymentNetworkSpec, + DeploymentState, + DeploymentSubStep, + ReplicaSpec, + RouteInfo, + RouteStatus, + RouteTrafficStatus, +) +from ai.backend.manager.models.deployment_policy import 
BlueGreenSpec +from ai.backend.manager.repositories.base import Creator +from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec +from ai.backend.manager.sokovan.deployment.strategy.blue_green import blue_green_evaluate + +# ── Helpers ── + +_ENDPOINT_ID = uuid.uuid4() +_DEPLOYING_REVISION_ID = uuid.uuid4() +_OLD_REVISION_ID = uuid.uuid4() +_SESSION_OWNER = uuid.uuid4() +_PROJECT_ID = uuid.uuid4() +_DOMAIN = "default" + + +def _make_deployment( + *, + desired: int = 3, + deploying_revision_id: uuid.UUID | None = None, +) -> DeploymentInfo: + return DeploymentInfo( + id=_ENDPOINT_ID, + metadata=DeploymentMetadata( + name="test-deploy", + domain=_DOMAIN, + project=_PROJECT_ID, + resource_group="default", + created_user=_SESSION_OWNER, + session_owner=_SESSION_OWNER, + created_at=datetime.now(UTC), + revision_history_limit=5, + ), + state=DeploymentState( + lifecycle="DEPLOYING", # type: ignore[arg-type] + retry_count=0, + ), + replica_spec=ReplicaSpec( + replica_count=desired, + desired_replica_count=desired, + ), + network=DeploymentNetworkSpec(open_to_public=False), + model_revisions=[], + deploying_revision_id=deploying_revision_id or _DEPLOYING_REVISION_ID, + ) + + +def _make_route( + *, + revision_id: uuid.UUID | None = None, + status: RouteStatus = RouteStatus.HEALTHY, + traffic_status: RouteTrafficStatus = RouteTrafficStatus.ACTIVE, + traffic_ratio: float = 1.0, + status_updated_at: datetime | None = None, +) -> RouteInfo: + if status_updated_at is None: + status_updated_at = datetime.now(UTC) + return RouteInfo( + route_id=uuid.uuid4(), + endpoint_id=_ENDPOINT_ID, + session_id=None, + status=status, + traffic_ratio=traffic_ratio, + created_at=datetime.now(UTC), + revision_id=revision_id, + traffic_status=traffic_status, + status_updated_at=status_updated_at, + ) + + +def _blue_routes( + count: int, + *, + status: RouteStatus = RouteStatus.HEALTHY, +) -> list[RouteInfo]: + return [_make_route(revision_id=_OLD_REVISION_ID, 
status=status) for _ in range(count)] + + +def _green_routes( + count: int, + *, + status: RouteStatus = RouteStatus.HEALTHY, + traffic_status: RouteTrafficStatus = RouteTrafficStatus.INACTIVE, + traffic_ratio: float = 0.0, + status_updated_at: datetime | None = None, +) -> list[RouteInfo]: + return [ + _make_route( + revision_id=_DEPLOYING_REVISION_ID, + status=status, + traffic_status=traffic_status, + traffic_ratio=traffic_ratio, + status_updated_at=status_updated_at, + ) + for _ in range(count) + ] + + +def _default_spec( + *, + auto_promote: bool = False, + promote_delay_seconds: int = 0, +) -> BlueGreenSpec: + return BlueGreenSpec( + auto_promote=auto_promote, + promote_delay_seconds=promote_delay_seconds, + ) + + +# ── Test Classes ── + + +class TestNoGreenRoutes: + """Step 2: No green routes → create them (INACTIVE).""" + + def test_creates_green_routes_when_none_exist(self) -> None: + deployment = _make_deployment(desired=3) + routes = _blue_routes(3) + + result = blue_green_evaluate(deployment, routes, _default_spec()) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + assert len(result.route_changes.scale_out_specs) == 3 + assert not result.route_changes.scale_in_route_ids + assert not result.route_changes.promote_route_ids + + def test_creator_spec_has_inactive_traffic(self) -> None: + deployment = _make_deployment(desired=2) + routes = _blue_routes(2) + + result = blue_green_evaluate(deployment, routes, _default_spec()) + + for creator in result.route_changes.scale_out_specs: + assert isinstance(creator, Creator) + spec = creator.spec + assert isinstance(spec, RouteCreatorSpec) + assert spec.traffic_status == RouteTrafficStatus.INACTIVE + assert spec.traffic_ratio == 0.0 + assert spec.revision_id == _DEPLOYING_REVISION_ID + + def test_creates_routes_when_no_blue_either(self) -> None: + deployment = _make_deployment(desired=2) + + result = blue_green_evaluate(deployment, [], _default_spec()) + + assert 
result.sub_step == DeploymentSubStep.PROVISIONING + assert len(result.route_changes.scale_out_specs) == 2 + + +class TestGreenProvisioning: + """Step 3: Green PROVISIONING → wait.""" + + def test_waits_when_green_provisioning(self) -> None: + deployment = _make_deployment(desired=3) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.PROVISIONING) + + result = blue_green_evaluate(deployment, routes, _default_spec()) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + assert not result.route_changes.scale_out_specs + assert not result.route_changes.scale_in_route_ids + + def test_waits_when_mixed_provisioning_and_healthy(self) -> None: + deployment = _make_deployment(desired=3) + routes = ( + _blue_routes(3) + + _green_routes(2, status=RouteStatus.HEALTHY) + + _green_routes(1, status=RouteStatus.PROVISIONING) + ) + + result = blue_green_evaluate(deployment, routes, _default_spec()) + + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert not result.completed + + +class TestRollback: + """Step 4: All green failed → rollback.""" + + def test_rollback_when_all_green_failed(self) -> None: + deployment = _make_deployment(desired=3) + green_failed = _green_routes(3, status=RouteStatus.FAILED_TO_START) + routes = _blue_routes(3) + green_failed + + result = blue_green_evaluate(deployment, routes, _default_spec()) + + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + assert not result.completed + assert len(result.route_changes.scale_in_route_ids) == 3 + for gf in green_failed: + assert gf.route_id in result.route_changes.scale_in_route_ids + + def test_rollback_with_terminated_green(self) -> None: + deployment = _make_deployment(desired=2) + routes = _blue_routes(2) + _green_routes(2, status=RouteStatus.TERMINATED) + + result = blue_green_evaluate(deployment, routes, _default_spec()) + + assert result.sub_step == DeploymentSubStep.ROLLED_BACK + assert len(result.route_changes.scale_in_route_ids) == 2 
+ + def test_no_rollback_when_some_green_healthy(self) -> None: + deployment = _make_deployment(desired=3) + routes = ( + _blue_routes(3) + + _green_routes(1, status=RouteStatus.HEALTHY) + + _green_routes(2, status=RouteStatus.FAILED_TO_START) + ) + + result = blue_green_evaluate(deployment, routes, _default_spec()) + + # Mixed: healthy < desired → PROGRESSING (step 5) + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + + +class TestHealthyLessThanDesired: + """Step 5: Healthy green < desired → PROGRESSING.""" + + def test_progressing_when_healthy_less_than_desired(self) -> None: + deployment = _make_deployment(desired=5) + routes = _blue_routes(5) + _green_routes(3, status=RouteStatus.HEALTHY) + + result = blue_green_evaluate(deployment, routes, _default_spec()) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + + +class TestManualPromotion: + """Step 6: All green healthy + auto_promote=False → manual wait.""" + + def test_waits_for_manual_promotion(self) -> None: + deployment = _make_deployment(desired=3) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) + spec = _default_spec(auto_promote=False) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + assert not result.route_changes.promote_route_ids + assert not result.route_changes.scale_in_route_ids + + +class TestPromoteDelay: + """Step 7: auto_promote=True + promote_delay_seconds.""" + + def test_waits_when_delay_not_elapsed(self) -> None: + deployment = _make_deployment(desired=3) + recent = datetime.now(UTC) - timedelta(seconds=10) + routes = _blue_routes(3) + _green_routes( + 3, status=RouteStatus.HEALTHY, status_updated_at=recent + ) + spec = _default_spec(auto_promote=True, promote_delay_seconds=60) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == 
DeploymentSubStep.PROGRESSING + assert not result.completed + + def test_promotes_when_delay_elapsed(self) -> None: + deployment = _make_deployment(desired=3) + past = datetime.now(UTC) - timedelta(seconds=120) + green = _green_routes(3, status=RouteStatus.HEALTHY, status_updated_at=past) + blue = _blue_routes(3) + routes = blue + green + spec = _default_spec(auto_promote=True, promote_delay_seconds=60) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert len(result.route_changes.promote_route_ids) == 3 + assert len(result.route_changes.scale_in_route_ids) == 3 + + def test_waits_when_status_updated_at_is_none(self) -> None: + deployment = _make_deployment(desired=2) + green = _green_routes(2, status=RouteStatus.HEALTHY, status_updated_at=None) + # Manually set status_updated_at to None + for r in green: + object.__setattr__(r, "status_updated_at", None) + routes = _blue_routes(2) + green + spec = _default_spec(auto_promote=True, promote_delay_seconds=30) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + + +class TestAutoPromotion: + """Step 8: auto_promote=True + delay=0 → immediate promotion.""" + + def test_promotes_immediately_with_zero_delay(self) -> None: + deployment = _make_deployment(desired=3) + green = _green_routes(3, status=RouteStatus.HEALTHY) + blue = _blue_routes(3) + routes = blue + green + spec = _default_spec(auto_promote=True, promote_delay_seconds=0) + + result = blue_green_evaluate(deployment, routes, spec) + + assert result.completed + assert result.sub_step == DeploymentSubStep.PROGRESSING + # Green route IDs promoted + assert len(result.route_changes.promote_route_ids) == 3 + for g in green: + assert g.route_id in result.route_changes.promote_route_ids + # Blue route IDs scaled in + assert len(result.route_changes.scale_in_route_ids) == 3 + for b in blue: + assert b.route_id in 
result.route_changes.scale_in_route_ids + + def test_no_blue_to_terminate(self) -> None: + deployment = _make_deployment(desired=2) + green = _green_routes(2, status=RouteStatus.HEALTHY) + spec = _default_spec(auto_promote=True) + + result = blue_green_evaluate(deployment, green, spec) + + assert result.completed + assert len(result.route_changes.promote_route_ids) == 2 + assert len(result.route_changes.scale_in_route_ids) == 0 + + +class TestSingleReplica: + """Edge case: desired=1.""" + + def test_single_replica_full_cycle(self) -> None: + deployment = _make_deployment(desired=1) + green = _green_routes(1, status=RouteStatus.HEALTHY) + blue = _blue_routes(1) + spec = _default_spec(auto_promote=True) + + result = blue_green_evaluate(deployment, blue + green, spec) + + assert result.completed + assert len(result.route_changes.promote_route_ids) == 1 + assert len(result.route_changes.scale_in_route_ids) == 1 + + +class TestLargeReplicaCount: + """Edge case: desired=10.""" + + def test_creates_correct_number_of_green_routes(self) -> None: + deployment = _make_deployment(desired=10) + routes = _blue_routes(10) + + result = blue_green_evaluate(deployment, routes, _default_spec()) + + assert len(result.route_changes.scale_out_specs) == 10 + + +class TestBlueRouteStatuses: + """Only active blue routes are terminated during promotion.""" + + def test_only_active_blue_terminated(self) -> None: + deployment = _make_deployment(desired=2) + blue_active = _blue_routes(2, status=RouteStatus.HEALTHY) + blue_inactive = [ + _make_route(revision_id=_OLD_REVISION_ID, status=RouteStatus.TERMINATED), + ] + green = _green_routes(2, status=RouteStatus.HEALTHY) + spec = _default_spec(auto_promote=True) + + result = blue_green_evaluate(deployment, blue_active + blue_inactive + green, spec) + + assert result.completed + # Only active blue routes are terminated + assert len(result.route_changes.scale_in_route_ids) == 2 + for b in blue_active: + assert b.route_id in 
result.route_changes.scale_in_route_ids + for b in blue_inactive: + assert b.route_id not in result.route_changes.scale_in_route_ids + + +class TestCreatorSpecFields: + """Verify RouteCreatorSpec fields for green routes.""" + + def test_creator_fields(self) -> None: + deployment = _make_deployment(desired=1) + + result = blue_green_evaluate(deployment, [], _default_spec()) + + wrapper = result.route_changes.scale_out_specs[0] + assert isinstance(wrapper, Creator) + spec = wrapper.spec + assert isinstance(spec, RouteCreatorSpec) + assert spec.endpoint_id == _ENDPOINT_ID + assert spec.session_owner_id == _SESSION_OWNER + assert spec.domain == _DOMAIN + assert spec.project_id == _PROJECT_ID + assert spec.traffic_ratio == 0.0 + assert spec.traffic_status == RouteTrafficStatus.INACTIVE + assert spec.revision_id == _DEPLOYING_REVISION_ID + + +class TestMixedGreenStatuses: + """Mixed green routes: some healthy, some failed, no provisioning.""" + + def test_mixed_healthy_and_failed_progresses(self) -> None: + deployment = _make_deployment(desired=4) + routes = ( + _blue_routes(4) + + _green_routes(2, status=RouteStatus.HEALTHY) + + _green_routes(2, status=RouteStatus.FAILED_TO_START) + ) + spec = _default_spec(auto_promote=True) + + result = blue_green_evaluate(deployment, routes, spec) + + # 2 healthy < 4 desired → PROGRESSING + assert result.sub_step == DeploymentSubStep.PROGRESSING + assert not result.completed + + +class TestDifferentEndpoints: + """Routes for different endpoints should still classify correctly.""" + + def test_different_deploying_revision(self) -> None: + other_revision = uuid.uuid4() + deployment = _make_deployment(desired=2, deploying_revision_id=other_revision) + # Routes with a different revision_id are classified as blue + routes = [ + _make_route(revision_id=_DEPLOYING_REVISION_ID, status=RouteStatus.HEALTHY), + _make_route(revision_id=_DEPLOYING_REVISION_ID, status=RouteStatus.HEALTHY), + ] + + result = blue_green_evaluate(deployment, routes, 
_default_spec()) + + # These are classified as blue (different revision), so no green → create + assert result.sub_step == DeploymentSubStep.PROVISIONING + assert len(result.route_changes.scale_out_specs) == 2 + + +class TestAtomicPromotion: + """Promotion is atomic: all green promoted + all blue terminated in one cycle.""" + + def test_atomic_promotion(self) -> None: + deployment = _make_deployment(desired=5) + green = _green_routes(5, status=RouteStatus.HEALTHY) + blue = _blue_routes(5) + spec = _default_spec(auto_promote=True) + + result = blue_green_evaluate(deployment, blue + green, spec) + + assert result.completed + green_ids = {g.route_id for g in green} + blue_ids = {b.route_id for b in blue} + assert set(result.route_changes.promote_route_ids) == green_ids + assert set(result.route_changes.scale_in_route_ids) == blue_ids + + +class TestNoScaleOutDuringWait: + """No new routes created when waiting for green to become healthy.""" + + def test_no_scale_out_during_provisioning_wait(self) -> None: + deployment = _make_deployment(desired=3) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.PROVISIONING) + + result = blue_green_evaluate(deployment, routes, _default_spec()) + + assert not result.route_changes.scale_out_specs + + def test_no_scale_out_during_progressing(self) -> None: + deployment = _make_deployment(desired=3) + routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) + spec = _default_spec(auto_promote=False) + + result = blue_green_evaluate(deployment, routes, spec) + + assert not result.route_changes.scale_out_specs From 35a27aec1697b8df1770196db35921c6865637dc Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 04:15:59 +0000 Subject: [PATCH 05/23] Revert "feat: Implement Blue-Green deployment strategy (BEP-1049)" This reverts commit 23732f418aeab6700e245dbbfbe4a9a98b397a14. 
--- .../backend/manager/data/deployment/types.py | 1 - ...0e1f2_add_status_updated_at_to_routings.py | 34 -- src/ai/backend/manager/models/routing/row.py | 7 - .../repositories/deployment/creators/route.py | 2 - .../deployment/db_source/db_source.py | 30 -- .../repositories/deployment/repository.py | 11 +- .../sokovan/deployment/strategy/blue_green.py | 116 +---- .../sokovan/deployment/strategy/evaluator.py | 24 +- .../sokovan/deployment/strategy/types.py | 1 - .../manager/sokovan/deployment/strategy/BUILD | 3 - .../sokovan/deployment/strategy/__init__.py | 0 .../deployment/strategy/test_blue_green.py | 491 ------------------ 12 files changed, 9 insertions(+), 711 deletions(-) delete mode 100644 src/ai/backend/manager/models/alembic/versions/a7b8c9d0e1f2_add_status_updated_at_to_routings.py delete mode 100644 tests/unit/manager/sokovan/deployment/strategy/BUILD delete mode 100644 tests/unit/manager/sokovan/deployment/strategy/__init__.py delete mode 100644 tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py diff --git a/src/ai/backend/manager/data/deployment/types.py b/src/ai/backend/manager/data/deployment/types.py index 28a3ff8e050..ccee6a29ed1 100644 --- a/src/ai/backend/manager/data/deployment/types.py +++ b/src/ai/backend/manager/data/deployment/types.py @@ -405,7 +405,6 @@ class RouteInfo: created_at: datetime | None revision_id: UUID | None traffic_status: RouteTrafficStatus - status_updated_at: datetime | None = None error_data: dict[str, Any] = field(default_factory=dict) diff --git a/src/ai/backend/manager/models/alembic/versions/a7b8c9d0e1f2_add_status_updated_at_to_routings.py b/src/ai/backend/manager/models/alembic/versions/a7b8c9d0e1f2_add_status_updated_at_to_routings.py deleted file mode 100644 index b85b941ba00..00000000000 --- a/src/ai/backend/manager/models/alembic/versions/a7b8c9d0e1f2_add_status_updated_at_to_routings.py +++ /dev/null @@ -1,34 +0,0 @@ -"""add status_updated_at to routings - -Revision ID: a7b8c9d0e1f2 -Revises: 
32ad43817452 -Create Date: 2026-03-02 - -""" - -import sqlalchemy as sa -from alembic import op - -# revision identifiers, used by Alembic. -revision = "a7b8c9d0e1f2" -down_revision = "32ad43817452" -branch_labels = None -depends_on = None - - -def upgrade() -> None: - op.add_column( - "routings", - sa.Column( - "status_updated_at", - sa.DateTime(timezone=True), - server_default=sa.text("now()"), - nullable=True, - ), - ) - # Backfill existing rows with created_at value - op.execute("UPDATE routings SET status_updated_at = COALESCE(created_at, now())") - - -def downgrade() -> None: - op.drop_column("routings", "status_updated_at") diff --git a/src/ai/backend/manager/models/routing/row.py b/src/ai/backend/manager/models/routing/row.py index 37ac464c8ce..51a9d9c1f9f 100644 --- a/src/ai/backend/manager/models/routing/row.py +++ b/src/ai/backend/manager/models/routing/row.py @@ -95,12 +95,6 @@ class RoutingRow(Base): # type: ignore[misc] # Revision reference without FK (relationship only) revision: Mapped[uuid.UUID | None] = mapped_column("revision", GUID, nullable=True) - status_updated_at: Mapped[datetime | None] = mapped_column( - "status_updated_at", - sa.DateTime(timezone=True), - server_default=sa.text("now()"), - nullable=True, - ) traffic_status: Mapped[RouteTrafficStatus] = mapped_column( "traffic_status", EnumValueType(RouteTrafficStatus), @@ -261,6 +255,5 @@ def to_route_info(self) -> RouteInfo: created_at=self.created_at, revision_id=self.revision, traffic_status=self.traffic_status, - status_updated_at=self.status_updated_at, error_data=self.error_data or {}, ) diff --git a/src/ai/backend/manager/repositories/deployment/creators/route.py b/src/ai/backend/manager/repositories/deployment/creators/route.py index 254b6c087f4..2b313d7c172 100644 --- a/src/ai/backend/manager/repositories/deployment/creators/route.py +++ b/src/ai/backend/manager/repositories/deployment/creators/route.py @@ -4,7 +4,6 @@ import uuid from dataclasses import dataclass -from datetime 
import UTC, datetime from typing import Any, override from ai.backend.manager.data.deployment.types import RouteStatus, RouteTrafficStatus @@ -67,7 +66,6 @@ def build_values(self) -> dict[str, Any]: values: dict[str, Any] = {} if self.status is not None: values["status"] = self.status - values["status_updated_at"] = datetime.now(UTC) if self.traffic_ratio is not None: values["traffic_ratio"] = self.traffic_ratio if self.traffic_status is not None: diff --git a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py index 050e712f7e2..ac953e35c20 100644 --- a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py +++ b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py @@ -1406,37 +1406,10 @@ async def fetch_active_routes_by_endpoint_ids( routes_by_endpoint[row.endpoint].append(row.to_route_info()) return routes_by_endpoint - async def fetch_routes_by_endpoint_ids( - self, - endpoint_ids: set[uuid.UUID], - ) -> Mapping[uuid.UUID, list[RouteInfo]]: - """Fetch all routes for given endpoint IDs (no status filter). - - Unlike fetch_active_routes_by_endpoint_ids, this includes routes - in all statuses (FAILED_TO_START, TERMINATED, etc.), which is - required for blue-green rollback detection. 
- """ - if not endpoint_ids: - return {} - - async with self._begin_readonly_session_read_committed() as db_sess: - query = sa.select(RoutingRow).where( - RoutingRow.endpoint.in_(endpoint_ids), - ) - result = await db_sess.execute(query) - rows: Sequence[RoutingRow] = result.scalars().all() - routes_by_endpoint: defaultdict[uuid.UUID, list[RouteInfo]] = defaultdict(list) - for row in rows: - if row.endpoint not in routes_by_endpoint: - routes_by_endpoint[row.endpoint] = [] - routes_by_endpoint[row.endpoint].append(row.to_route_info()) - return routes_by_endpoint - async def scale_routes( self, scale_out_creators: Sequence[Creator[RoutingRow]], scale_in_updater: BatchUpdater[RoutingRow] | None, - promote_updater: BatchUpdater[RoutingRow] | None = None, ) -> None: """Scale out/in routes based on provided creators and updater.""" async with self._begin_session_read_committed() as db_sess: @@ -1446,9 +1419,6 @@ async def scale_routes( # Scale in routes if scale_in_updater: await execute_batch_updater(db_sess, scale_in_updater) - # Promote routes (blue-green) - if promote_updater: - await execute_batch_updater(db_sess, promote_updater) # Route operations diff --git a/src/ai/backend/manager/repositories/deployment/repository.py b/src/ai/backend/manager/repositories/deployment/repository.py index 4b28ef2596d..20a6c9df4c9 100644 --- a/src/ai/backend/manager/repositories/deployment/repository.py +++ b/src/ai/backend/manager/repositories/deployment/repository.py @@ -548,22 +548,13 @@ async def fetch_active_routes_by_endpoint_ids( """Fetch routes for multiple endpoints.""" return await self._db_source.fetch_active_routes_by_endpoint_ids(endpoint_ids) - @deployment_repository_resilience.apply() - async def fetch_routes_by_endpoint_ids( - self, - endpoint_ids: set[uuid.UUID], - ) -> Mapping[uuid.UUID, list[RouteInfo]]: - """Fetch all routes for multiple endpoints (no status filter).""" - return await self._db_source.fetch_routes_by_endpoint_ids(endpoint_ids) - 
@deployment_repository_resilience.apply() async def scale_routes( self, scale_out_creators: Sequence[Creator[RoutingRow]], scale_in_updater: BatchUpdater[RoutingRow] | None, - promote_updater: BatchUpdater[RoutingRow] | None = None, ) -> None: - await self._db_source.scale_routes(scale_out_creators, scale_in_updater, promote_updater) + await self._db_source.scale_routes(scale_out_creators, scale_in_updater) # Route operations diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py index 37f071abd32..6e76625ed8b 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py @@ -7,21 +7,14 @@ from __future__ import annotations from collections.abc import Sequence -from datetime import UTC, datetime from ai.backend.manager.data.deployment.types import ( DeploymentInfo, - DeploymentSubStep, RouteInfo, - RouteStatus, - RouteTrafficStatus, ) from ai.backend.manager.models.deployment_policy import BlueGreenSpec -from ai.backend.manager.models.routing import RoutingRow -from ai.backend.manager.repositories.base import Creator -from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec -from .types import CycleEvaluationResult, RouteChanges +from .types import CycleEvaluationResult def blue_green_evaluate( @@ -29,108 +22,5 @@ def blue_green_evaluate( routes: Sequence[RouteInfo], spec: BlueGreenSpec, ) -> CycleEvaluationResult: - """Evaluate one cycle of blue-green deployment for a single deployment. - - FSM Steps: - 1. Classify routes into blue (old revision) and green (new/deploying revision). - 2. If no green routes exist, create ``desired`` green routes (INACTIVE). - 3. If any green is PROVISIONING, wait. - 4. If all green routes FAILED, rollback. - 5. If healthy green < desired, wait. - 6. All green healthy + auto_promote=False → wait for manual promotion. - 7. 
All green healthy + auto_promote=True + delay not elapsed → wait. - 8. All green healthy + auto_promote=True + delay elapsed → promote. - """ - deploying_revision = deployment.deploying_revision_id - desired = deployment.replica_spec.target_replica_count - - # ── Step 1: Classify routes ── - green: list[RouteInfo] = [] - blue: list[RouteInfo] = [] - for r in routes: - if r.revision_id is not None and r.revision_id == deploying_revision: - green.append(r) - else: - blue.append(r) - - green_provisioning = [r for r in green if r.status == RouteStatus.PROVISIONING] - green_healthy = [r for r in green if r.status == RouteStatus.HEALTHY] - green_failed = [ - r for r in green if r.status in (RouteStatus.FAILED_TO_START, RouteStatus.TERMINATED) - ] - blue_active = [r for r in blue if r.status.is_active()] - - # ── Step 2: No green routes → create them (INACTIVE, ratio=0.0) ── - if not green: - creators = _build_route_creators(deployment, desired) - return CycleEvaluationResult( - sub_step=DeploymentSubStep.PROVISIONING, - route_changes=RouteChanges(scale_out_specs=creators), - ) - - # ── Step 3: Green PROVISIONING exists → wait ── - if green_provisioning: - return CycleEvaluationResult(sub_step=DeploymentSubStep.PROVISIONING) - - # ── Step 4: All green failed → rollback ── - if green_failed and not green_healthy: - return CycleEvaluationResult( - sub_step=DeploymentSubStep.ROLLED_BACK, - route_changes=RouteChanges( - scale_in_route_ids=[r.route_id for r in green_failed], - ), - ) - - # ── Step 5: Healthy green < desired → wait (progressing) ── - if len(green_healthy) < desired: - return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) - - # ── Step 6: All green healthy + auto_promote=False → manual wait ── - if not spec.auto_promote: - return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) - - # ── Step 7: auto_promote=True + delay check ── - if spec.promote_delay_seconds > 0: - latest_healthy_at = _latest_status_updated_at(green_healthy) - if 
latest_healthy_at is None: - return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) - elapsed = (datetime.now(UTC) - latest_healthy_at).total_seconds() - if elapsed < spec.promote_delay_seconds: - return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) - - # ── Step 8: Promote green, terminate blue ── - return CycleEvaluationResult( - sub_step=DeploymentSubStep.PROGRESSING, - completed=True, - route_changes=RouteChanges( - promote_route_ids=[r.route_id for r in green_healthy], - scale_in_route_ids=[r.route_id for r in blue_active], - ), - ) - - -def _latest_status_updated_at(routes: list[RouteInfo]) -> datetime | None: - """Return the most recent status_updated_at among the given routes.""" - timestamps = [r.status_updated_at for r in routes if r.status_updated_at is not None] - return max(timestamps) if timestamps else None - - -def _build_route_creators( - deployment: DeploymentInfo, - count: int, -) -> list[Creator[RoutingRow]]: - """Build route creators for green routes (INACTIVE, traffic_ratio=0.0).""" - return [ - Creator( - spec=RouteCreatorSpec( - endpoint_id=deployment.id, - session_owner_id=deployment.metadata.session_owner, - domain=deployment.metadata.domain, - project_id=deployment.metadata.project, - traffic_ratio=0.0, - revision_id=deployment.deploying_revision_id, - traffic_status=RouteTrafficStatus.INACTIVE, - ) - ) - for _ in range(count) - ] + """Evaluate one cycle of blue-green deployment for a single deployment.""" + raise NotImplementedError("Blue-green deployment strategy is not yet implemented") diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index e10c8463f8f..69fe7f4a957 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -63,12 +63,11 @@ async def evaluate( policy_map = await 
self._deployment_repo.fetch_deployment_policies_by_endpoint_ids( endpoint_ids ) - route_map = await self._deployment_repo.fetch_routes_by_endpoint_ids(endpoint_ids) + route_map = await self._deployment_repo.fetch_active_routes_by_endpoint_ids(endpoint_ids) # ── 2. Per-deployment evaluation ── all_scale_out: list[Creator[RoutingRow]] = [] all_scale_in_ids: list[UUID] = [] - all_promote_ids: list[UUID] = [] for deployment in deployments: policy = policy_map.get(deployment.id) @@ -90,7 +89,6 @@ async def evaluate( changes = cycle_result.route_changes all_scale_out.extend(changes.scale_out_specs) all_scale_in_ids.extend(changes.scale_in_route_ids) - all_promote_ids.extend(changes.promote_route_ids) # Group by sub-step if cycle_result.completed: @@ -103,7 +101,7 @@ async def evaluate( group.deployments.append(deployment) # ── 3. Apply route mutations in batch ── - await self._apply_route_changes(all_scale_out, all_scale_in_ids, all_promote_ids) + await self._apply_route_changes(all_scale_out, all_scale_in_ids) return result @@ -137,10 +135,9 @@ async def _apply_route_changes( self, scale_out: list[Creator[RoutingRow]], scale_in_ids: list[UUID], - promote_ids: list[UUID] | None = None, ) -> None: """Apply aggregated route mutations in a single DB transaction.""" - if not scale_out and not scale_in_ids and not promote_ids: + if not scale_out and not scale_in_ids: return scale_in_updater: BatchUpdater[RoutingRow] | None = None @@ -154,20 +151,9 @@ async def _apply_route_changes( conditions=[RouteConditions.by_ids(scale_in_ids)], ) - promote_updater: BatchUpdater[RoutingRow] | None = None - if promote_ids: - promote_updater = BatchUpdater( - spec=RouteBatchUpdaterSpec( - traffic_status=RouteTrafficStatus.ACTIVE, - traffic_ratio=1.0, - ), - conditions=[RouteConditions.by_ids(promote_ids)], - ) - - await self._deployment_repo.scale_routes(scale_out, scale_in_updater, promote_updater) + await self._deployment_repo.scale_routes(scale_out, scale_in_updater) log.debug( - "Applied 
route changes: {} created, {} terminated, {} promoted", + "Applied route changes: {} created, {} terminated", len(scale_out), len(scale_in_ids), - len(promote_ids) if promote_ids else 0, ) diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/types.py b/src/ai/backend/manager/sokovan/deployment/strategy/types.py index cb69cbe2f9e..dd8c61c13e5 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/types.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/types.py @@ -19,7 +19,6 @@ class RouteChanges: scale_out_specs: list[Creator[RoutingRow]] = field(default_factory=list) scale_in_route_ids: list[UUID] = field(default_factory=list) - promote_route_ids: list[UUID] = field(default_factory=list) @dataclass diff --git a/tests/unit/manager/sokovan/deployment/strategy/BUILD b/tests/unit/manager/sokovan/deployment/strategy/BUILD deleted file mode 100644 index 57341b1358b..00000000000 --- a/tests/unit/manager/sokovan/deployment/strategy/BUILD +++ /dev/null @@ -1,3 +0,0 @@ -python_tests( - name="tests", -) diff --git a/tests/unit/manager/sokovan/deployment/strategy/__init__.py b/tests/unit/manager/sokovan/deployment/strategy/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py b/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py deleted file mode 100644 index cd3be817bf4..00000000000 --- a/tests/unit/manager/sokovan/deployment/strategy/test_blue_green.py +++ /dev/null @@ -1,491 +0,0 @@ -"""Unit tests for blue-green deployment strategy evaluation (BEP-1049).""" - -from __future__ import annotations - -import uuid -from datetime import UTC, datetime, timedelta - -from ai.backend.manager.data.deployment.types import ( - DeploymentInfo, - DeploymentMetadata, - DeploymentNetworkSpec, - DeploymentState, - DeploymentSubStep, - ReplicaSpec, - RouteInfo, - RouteStatus, - RouteTrafficStatus, -) -from ai.backend.manager.models.deployment_policy import 
BlueGreenSpec -from ai.backend.manager.repositories.base import Creator -from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec -from ai.backend.manager.sokovan.deployment.strategy.blue_green import blue_green_evaluate - -# ── Helpers ── - -_ENDPOINT_ID = uuid.uuid4() -_DEPLOYING_REVISION_ID = uuid.uuid4() -_OLD_REVISION_ID = uuid.uuid4() -_SESSION_OWNER = uuid.uuid4() -_PROJECT_ID = uuid.uuid4() -_DOMAIN = "default" - - -def _make_deployment( - *, - desired: int = 3, - deploying_revision_id: uuid.UUID | None = None, -) -> DeploymentInfo: - return DeploymentInfo( - id=_ENDPOINT_ID, - metadata=DeploymentMetadata( - name="test-deploy", - domain=_DOMAIN, - project=_PROJECT_ID, - resource_group="default", - created_user=_SESSION_OWNER, - session_owner=_SESSION_OWNER, - created_at=datetime.now(UTC), - revision_history_limit=5, - ), - state=DeploymentState( - lifecycle="DEPLOYING", # type: ignore[arg-type] - retry_count=0, - ), - replica_spec=ReplicaSpec( - replica_count=desired, - desired_replica_count=desired, - ), - network=DeploymentNetworkSpec(open_to_public=False), - model_revisions=[], - deploying_revision_id=deploying_revision_id or _DEPLOYING_REVISION_ID, - ) - - -def _make_route( - *, - revision_id: uuid.UUID | None = None, - status: RouteStatus = RouteStatus.HEALTHY, - traffic_status: RouteTrafficStatus = RouteTrafficStatus.ACTIVE, - traffic_ratio: float = 1.0, - status_updated_at: datetime | None = None, -) -> RouteInfo: - if status_updated_at is None: - status_updated_at = datetime.now(UTC) - return RouteInfo( - route_id=uuid.uuid4(), - endpoint_id=_ENDPOINT_ID, - session_id=None, - status=status, - traffic_ratio=traffic_ratio, - created_at=datetime.now(UTC), - revision_id=revision_id, - traffic_status=traffic_status, - status_updated_at=status_updated_at, - ) - - -def _blue_routes( - count: int, - *, - status: RouteStatus = RouteStatus.HEALTHY, -) -> list[RouteInfo]: - return [_make_route(revision_id=_OLD_REVISION_ID, 
status=status) for _ in range(count)] - - -def _green_routes( - count: int, - *, - status: RouteStatus = RouteStatus.HEALTHY, - traffic_status: RouteTrafficStatus = RouteTrafficStatus.INACTIVE, - traffic_ratio: float = 0.0, - status_updated_at: datetime | None = None, -) -> list[RouteInfo]: - return [ - _make_route( - revision_id=_DEPLOYING_REVISION_ID, - status=status, - traffic_status=traffic_status, - traffic_ratio=traffic_ratio, - status_updated_at=status_updated_at, - ) - for _ in range(count) - ] - - -def _default_spec( - *, - auto_promote: bool = False, - promote_delay_seconds: int = 0, -) -> BlueGreenSpec: - return BlueGreenSpec( - auto_promote=auto_promote, - promote_delay_seconds=promote_delay_seconds, - ) - - -# ── Test Classes ── - - -class TestNoGreenRoutes: - """Step 2: No green routes → create them (INACTIVE).""" - - def test_creates_green_routes_when_none_exist(self) -> None: - deployment = _make_deployment(desired=3) - routes = _blue_routes(3) - - result = blue_green_evaluate(deployment, routes, _default_spec()) - - assert result.sub_step == DeploymentSubStep.PROVISIONING - assert not result.completed - assert len(result.route_changes.scale_out_specs) == 3 - assert not result.route_changes.scale_in_route_ids - assert not result.route_changes.promote_route_ids - - def test_creator_spec_has_inactive_traffic(self) -> None: - deployment = _make_deployment(desired=2) - routes = _blue_routes(2) - - result = blue_green_evaluate(deployment, routes, _default_spec()) - - for creator in result.route_changes.scale_out_specs: - assert isinstance(creator, Creator) - spec = creator.spec - assert isinstance(spec, RouteCreatorSpec) - assert spec.traffic_status == RouteTrafficStatus.INACTIVE - assert spec.traffic_ratio == 0.0 - assert spec.revision_id == _DEPLOYING_REVISION_ID - - def test_creates_routes_when_no_blue_either(self) -> None: - deployment = _make_deployment(desired=2) - - result = blue_green_evaluate(deployment, [], _default_spec()) - - assert 
result.sub_step == DeploymentSubStep.PROVISIONING - assert len(result.route_changes.scale_out_specs) == 2 - - -class TestGreenProvisioning: - """Step 3: Green PROVISIONING → wait.""" - - def test_waits_when_green_provisioning(self) -> None: - deployment = _make_deployment(desired=3) - routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.PROVISIONING) - - result = blue_green_evaluate(deployment, routes, _default_spec()) - - assert result.sub_step == DeploymentSubStep.PROVISIONING - assert not result.completed - assert not result.route_changes.scale_out_specs - assert not result.route_changes.scale_in_route_ids - - def test_waits_when_mixed_provisioning_and_healthy(self) -> None: - deployment = _make_deployment(desired=3) - routes = ( - _blue_routes(3) - + _green_routes(2, status=RouteStatus.HEALTHY) - + _green_routes(1, status=RouteStatus.PROVISIONING) - ) - - result = blue_green_evaluate(deployment, routes, _default_spec()) - - assert result.sub_step == DeploymentSubStep.PROVISIONING - assert not result.completed - - -class TestRollback: - """Step 4: All green failed → rollback.""" - - def test_rollback_when_all_green_failed(self) -> None: - deployment = _make_deployment(desired=3) - green_failed = _green_routes(3, status=RouteStatus.FAILED_TO_START) - routes = _blue_routes(3) + green_failed - - result = blue_green_evaluate(deployment, routes, _default_spec()) - - assert result.sub_step == DeploymentSubStep.ROLLED_BACK - assert not result.completed - assert len(result.route_changes.scale_in_route_ids) == 3 - for gf in green_failed: - assert gf.route_id in result.route_changes.scale_in_route_ids - - def test_rollback_with_terminated_green(self) -> None: - deployment = _make_deployment(desired=2) - routes = _blue_routes(2) + _green_routes(2, status=RouteStatus.TERMINATED) - - result = blue_green_evaluate(deployment, routes, _default_spec()) - - assert result.sub_step == DeploymentSubStep.ROLLED_BACK - assert len(result.route_changes.scale_in_route_ids) == 2 
- - def test_no_rollback_when_some_green_healthy(self) -> None: - deployment = _make_deployment(desired=3) - routes = ( - _blue_routes(3) - + _green_routes(1, status=RouteStatus.HEALTHY) - + _green_routes(2, status=RouteStatus.FAILED_TO_START) - ) - - result = blue_green_evaluate(deployment, routes, _default_spec()) - - # Mixed: healthy < desired → PROGRESSING (step 5) - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert not result.completed - - -class TestHealthyLessThanDesired: - """Step 5: Healthy green < desired → PROGRESSING.""" - - def test_progressing_when_healthy_less_than_desired(self) -> None: - deployment = _make_deployment(desired=5) - routes = _blue_routes(5) + _green_routes(3, status=RouteStatus.HEALTHY) - - result = blue_green_evaluate(deployment, routes, _default_spec()) - - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert not result.completed - - -class TestManualPromotion: - """Step 6: All green healthy + auto_promote=False → manual wait.""" - - def test_waits_for_manual_promotion(self) -> None: - deployment = _make_deployment(desired=3) - routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) - spec = _default_spec(auto_promote=False) - - result = blue_green_evaluate(deployment, routes, spec) - - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert not result.completed - assert not result.route_changes.promote_route_ids - assert not result.route_changes.scale_in_route_ids - - -class TestPromoteDelay: - """Step 7: auto_promote=True + promote_delay_seconds.""" - - def test_waits_when_delay_not_elapsed(self) -> None: - deployment = _make_deployment(desired=3) - recent = datetime.now(UTC) - timedelta(seconds=10) - routes = _blue_routes(3) + _green_routes( - 3, status=RouteStatus.HEALTHY, status_updated_at=recent - ) - spec = _default_spec(auto_promote=True, promote_delay_seconds=60) - - result = blue_green_evaluate(deployment, routes, spec) - - assert result.sub_step == 
DeploymentSubStep.PROGRESSING - assert not result.completed - - def test_promotes_when_delay_elapsed(self) -> None: - deployment = _make_deployment(desired=3) - past = datetime.now(UTC) - timedelta(seconds=120) - green = _green_routes(3, status=RouteStatus.HEALTHY, status_updated_at=past) - blue = _blue_routes(3) - routes = blue + green - spec = _default_spec(auto_promote=True, promote_delay_seconds=60) - - result = blue_green_evaluate(deployment, routes, spec) - - assert result.completed - assert len(result.route_changes.promote_route_ids) == 3 - assert len(result.route_changes.scale_in_route_ids) == 3 - - def test_waits_when_status_updated_at_is_none(self) -> None: - deployment = _make_deployment(desired=2) - green = _green_routes(2, status=RouteStatus.HEALTHY, status_updated_at=None) - # Manually set status_updated_at to None - for r in green: - object.__setattr__(r, "status_updated_at", None) - routes = _blue_routes(2) + green - spec = _default_spec(auto_promote=True, promote_delay_seconds=30) - - result = blue_green_evaluate(deployment, routes, spec) - - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert not result.completed - - -class TestAutoPromotion: - """Step 8: auto_promote=True + delay=0 → immediate promotion.""" - - def test_promotes_immediately_with_zero_delay(self) -> None: - deployment = _make_deployment(desired=3) - green = _green_routes(3, status=RouteStatus.HEALTHY) - blue = _blue_routes(3) - routes = blue + green - spec = _default_spec(auto_promote=True, promote_delay_seconds=0) - - result = blue_green_evaluate(deployment, routes, spec) - - assert result.completed - assert result.sub_step == DeploymentSubStep.PROGRESSING - # Green route IDs promoted - assert len(result.route_changes.promote_route_ids) == 3 - for g in green: - assert g.route_id in result.route_changes.promote_route_ids - # Blue route IDs scaled in - assert len(result.route_changes.scale_in_route_ids) == 3 - for b in blue: - assert b.route_id in 
result.route_changes.scale_in_route_ids - - def test_no_blue_to_terminate(self) -> None: - deployment = _make_deployment(desired=2) - green = _green_routes(2, status=RouteStatus.HEALTHY) - spec = _default_spec(auto_promote=True) - - result = blue_green_evaluate(deployment, green, spec) - - assert result.completed - assert len(result.route_changes.promote_route_ids) == 2 - assert len(result.route_changes.scale_in_route_ids) == 0 - - -class TestSingleReplica: - """Edge case: desired=1.""" - - def test_single_replica_full_cycle(self) -> None: - deployment = _make_deployment(desired=1) - green = _green_routes(1, status=RouteStatus.HEALTHY) - blue = _blue_routes(1) - spec = _default_spec(auto_promote=True) - - result = blue_green_evaluate(deployment, blue + green, spec) - - assert result.completed - assert len(result.route_changes.promote_route_ids) == 1 - assert len(result.route_changes.scale_in_route_ids) == 1 - - -class TestLargeReplicaCount: - """Edge case: desired=10.""" - - def test_creates_correct_number_of_green_routes(self) -> None: - deployment = _make_deployment(desired=10) - routes = _blue_routes(10) - - result = blue_green_evaluate(deployment, routes, _default_spec()) - - assert len(result.route_changes.scale_out_specs) == 10 - - -class TestBlueRouteStatuses: - """Only active blue routes are terminated during promotion.""" - - def test_only_active_blue_terminated(self) -> None: - deployment = _make_deployment(desired=2) - blue_active = _blue_routes(2, status=RouteStatus.HEALTHY) - blue_inactive = [ - _make_route(revision_id=_OLD_REVISION_ID, status=RouteStatus.TERMINATED), - ] - green = _green_routes(2, status=RouteStatus.HEALTHY) - spec = _default_spec(auto_promote=True) - - result = blue_green_evaluate(deployment, blue_active + blue_inactive + green, spec) - - assert result.completed - # Only active blue routes are terminated - assert len(result.route_changes.scale_in_route_ids) == 2 - for b in blue_active: - assert b.route_id in 
result.route_changes.scale_in_route_ids - for b in blue_inactive: - assert b.route_id not in result.route_changes.scale_in_route_ids - - -class TestCreatorSpecFields: - """Verify RouteCreatorSpec fields for green routes.""" - - def test_creator_fields(self) -> None: - deployment = _make_deployment(desired=1) - - result = blue_green_evaluate(deployment, [], _default_spec()) - - wrapper = result.route_changes.scale_out_specs[0] - assert isinstance(wrapper, Creator) - spec = wrapper.spec - assert isinstance(spec, RouteCreatorSpec) - assert spec.endpoint_id == _ENDPOINT_ID - assert spec.session_owner_id == _SESSION_OWNER - assert spec.domain == _DOMAIN - assert spec.project_id == _PROJECT_ID - assert spec.traffic_ratio == 0.0 - assert spec.traffic_status == RouteTrafficStatus.INACTIVE - assert spec.revision_id == _DEPLOYING_REVISION_ID - - -class TestMixedGreenStatuses: - """Mixed green routes: some healthy, some failed, no provisioning.""" - - def test_mixed_healthy_and_failed_progresses(self) -> None: - deployment = _make_deployment(desired=4) - routes = ( - _blue_routes(4) - + _green_routes(2, status=RouteStatus.HEALTHY) - + _green_routes(2, status=RouteStatus.FAILED_TO_START) - ) - spec = _default_spec(auto_promote=True) - - result = blue_green_evaluate(deployment, routes, spec) - - # 2 healthy < 4 desired → PROGRESSING - assert result.sub_step == DeploymentSubStep.PROGRESSING - assert not result.completed - - -class TestDifferentEndpoints: - """Routes for different endpoints should still classify correctly.""" - - def test_different_deploying_revision(self) -> None: - other_revision = uuid.uuid4() - deployment = _make_deployment(desired=2, deploying_revision_id=other_revision) - # Routes with a different revision_id are classified as blue - routes = [ - _make_route(revision_id=_DEPLOYING_REVISION_ID, status=RouteStatus.HEALTHY), - _make_route(revision_id=_DEPLOYING_REVISION_ID, status=RouteStatus.HEALTHY), - ] - - result = blue_green_evaluate(deployment, routes, 
_default_spec()) - - # These are classified as blue (different revision), so no green → create - assert result.sub_step == DeploymentSubStep.PROVISIONING - assert len(result.route_changes.scale_out_specs) == 2 - - -class TestAtomicPromotion: - """Promotion is atomic: all green promoted + all blue terminated in one cycle.""" - - def test_atomic_promotion(self) -> None: - deployment = _make_deployment(desired=5) - green = _green_routes(5, status=RouteStatus.HEALTHY) - blue = _blue_routes(5) - spec = _default_spec(auto_promote=True) - - result = blue_green_evaluate(deployment, blue + green, spec) - - assert result.completed - green_ids = {g.route_id for g in green} - blue_ids = {b.route_id for b in blue} - assert set(result.route_changes.promote_route_ids) == green_ids - assert set(result.route_changes.scale_in_route_ids) == blue_ids - - -class TestNoScaleOutDuringWait: - """No new routes created when waiting for green to become healthy.""" - - def test_no_scale_out_during_provisioning_wait(self) -> None: - deployment = _make_deployment(desired=3) - routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.PROVISIONING) - - result = blue_green_evaluate(deployment, routes, _default_spec()) - - assert not result.route_changes.scale_out_specs - - def test_no_scale_out_during_progressing(self) -> None: - deployment = _make_deployment(desired=3) - routes = _blue_routes(3) + _green_routes(3, status=RouteStatus.HEALTHY) - spec = _default_spec(auto_promote=False) - - result = blue_green_evaluate(deployment, routes, spec) - - assert not result.route_changes.scale_out_specs From 36067729a2fdc7e5c37330c84628c94dd94521b7 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 04:37:33 +0000 Subject: [PATCH 06/23] wip --- .../deployment/db_source/db_source.py | 70 +++++++++++++++ .../repositories/deployment/repository.py | 12 +++ .../manager/sokovan/deployment/coordinator.py | 85 +++++++++++++++---- .../sokovan/deployment/strategy/evaluator.py | 1 + 
.../sokovan/deployment/strategy/types.py | 19 ++++- 5 files changed, 170 insertions(+), 17 deletions(-) diff --git a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py index ac953e35c20..82f6e598faa 100644 --- a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py +++ b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py @@ -2295,6 +2295,76 @@ async def complete_deployment_revision_swap( ) await db_sess.execute(stmt) + async def complete_deployment_and_transition_to_ready( + self, + endpoint_ids: set[uuid.UUID], + batch_updaters: Sequence[BatchUpdater[EndpointRow]], + bulk_creator: BulkCreator[DeploymentHistoryRow], + ) -> None: + """Atomically swap revisions, update lifecycle, and record history. + + Performs all three operations in a single transaction to prevent + inconsistent state if the process crashes between steps. + + The revision swap includes an idempotency guard + (deploying_revision IS NOT NULL) to prevent double-call issues. + + Args: + endpoint_ids: Set of endpoint IDs to swap revisions for. + batch_updaters: Sequence of BatchUpdaters for lifecycle status updates. + bulk_creator: BulkCreator containing all history records. + """ + if not endpoint_ids: + return + async with self._begin_session_read_committed() as db_sess: + # 1. Swap revisions with idempotency guard + swap_stmt = ( + sa.update(EndpointRow) + .where( + EndpointRow.id.in_(endpoint_ids), + EndpointRow.deploying_revision.isnot(None), + ) + .values( + current_revision=EndpointRow.deploying_revision, + deploying_revision=None, + ) + ) + await db_sess.execute(swap_stmt) + + # 2. Execute all lifecycle status updates + for batch_updater in batch_updaters: + await execute_batch_updater(db_sess, batch_updater) + + # 3. 
Record history (same logic as update_endpoint_lifecycle_bulk_with_history) + if not bulk_creator.specs: + return + + new_rows = [spec.build_row() for spec in bulk_creator.specs] + deployment_ids = [row.deployment_id for row in new_rows] + + last_records = await self._get_last_deployment_histories_bulk(db_sess, deployment_ids) + + merge_ids: list[uuid.UUID] = [] + create_rows: list[DeploymentHistoryRow] = [] + + for new_row in new_rows: + last_row = last_records.get(new_row.deployment_id) + if last_row is not None and last_row.should_merge_with(new_row): + merge_ids.append(last_row.id) + else: + create_rows.append(new_row) + + if merge_ids: + await db_sess.execute( + sa.update(DeploymentHistoryRow) + .where(DeploymentHistoryRow.id.in_(merge_ids)) + .values(attempts=DeploymentHistoryRow.attempts + 1) + ) + + if create_rows: + db_sess.add_all(create_rows) + await db_sess.flush() + async def clear_deploying_revision( self, endpoint_ids: set[uuid.UUID], diff --git a/src/ai/backend/manager/repositories/deployment/repository.py b/src/ai/backend/manager/repositories/deployment/repository.py index 20a6c9df4c9..ec44457c91b 100644 --- a/src/ai/backend/manager/repositories/deployment/repository.py +++ b/src/ai/backend/manager/repositories/deployment/repository.py @@ -1232,6 +1232,18 @@ async def complete_deployment_revision_swap( """Swap deploying_revision to current_revision for completed deployments.""" await self._db_source.complete_deployment_revision_swap(endpoint_ids) + @deployment_repository_resilience.apply() + async def complete_deployment_and_transition_to_ready( + self, + endpoint_ids: set[uuid.UUID], + batch_updaters: list[BatchUpdater[EndpointRow]], + bulk_creator: BulkCreator[DeploymentHistoryRow], + ) -> None: + """Atomically swap revisions, update lifecycle, and record history.""" + await self._db_source.complete_deployment_and_transition_to_ready( + endpoint_ids, batch_updaters, bulk_creator + ) + @deployment_repository_resilience.apply() async def 
clear_deploying_revision( self, diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index 4e2233c34d1..99ceef1cb2f 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -15,6 +15,7 @@ from ai.backend.common.clients.valkey_client.valkey_schedule import ValkeyScheduleClient from ai.backend.common.clients.valkey_client.valkey_stat.client import ValkeyStatClient from ai.backend.common.data.endpoint.types import EndpointLifecycle +from ai.backend.common.data.model_deployment.types import DeploymentStrategy from ai.backend.common.data.notification import NotificationRuleType from ai.backend.common.data.notification.messages import EndpointLifecycleChangedMessage from ai.backend.common.events.dispatcher import EventProducer @@ -364,7 +365,8 @@ async def _process_with_evaluator( 2. Load DEPLOYING deployments. 3. Run evaluator (evaluates strategy FSM + applies route mutations). 4. For each sub-step group, run the corresponding handler. - 5. For completed deployments, swap revisions and transition to READY. + 5. Handle errors and skipped deployments. + 6. For completed deployments, swap revisions and transition to READY. 
""" lock_lifetime = self._config_provider.config.manager.session_schedule_lock_lifetime async with self._lock_factory(LockID.LOCKID_DEPLOYMENT_DEPLOYING, lock_lifetime): @@ -377,6 +379,7 @@ async def _process_with_evaluator( log.info("DEPLOYING: processing {} deployments", len(deployments)) deployment_ids = [d.id for d in deployments] + sub_results: dict[DeploymentSubStep, DeploymentExecutionResult] = {} with DeploymentRecorderContext.scope( lifecycle_type.value, entity_ids=deployment_ids ) as pool: @@ -394,17 +397,61 @@ async def _process_with_evaluator( continue sub_result = await handler.execute(group.deployments) + sub_results[sub_step] = sub_result await self._handle_status_transitions(handler, sub_result, all_records) - # Post-process outside recorder scope + # Handle evaluation errors (Finding 3) — record history, keep DEPLOYING + if eval_result.errors: + error_history_specs = [ + DeploymentHistoryCreatorSpec( + deployment_id=deployment.id, + phase=lifecycle_type.value, + result=SchedulingResult.NEED_RETRY, + message=f"Evaluation error: {reason}", + from_status=EndpointLifecycle.DEPLOYING, + to_status=None, + sub_steps=[], + ) + for deployment, reason in eval_result.errors + ] + await self._deployment_repository.update_endpoint_lifecycle_bulk_with_history( + [], BulkCreator(specs=error_history_specs) + ) + for deployment, reason in eval_result.errors: + log.error("Deployment {} evaluation error: {}", deployment.id, reason) + + # Handle skipped deployments (Finding 5) — record history, keep DEPLOYING + if eval_result.skipped: + skipped_history_specs = [ + DeploymentHistoryCreatorSpec( + deployment_id=deployment.id, + phase=lifecycle_type.value, + result=SchedulingResult.SKIPPED, + message="No deployment policy found", + from_status=EndpointLifecycle.DEPLOYING, + to_status=None, + sub_steps=[], + ) + for deployment in eval_result.skipped + ] + await self._deployment_repository.update_endpoint_lifecycle_bulk_with_history( + [], 
BulkCreator(specs=skipped_history_specs) + ) + for deployment in eval_result.skipped: + log.warning("Deployment {} skipped: no deployment policy found", deployment.id) + + # Post-process outside recorder scope using actual sub_results (Finding 4) for sub_step, group in eval_result.groups.items(): handler_key = (lifecycle_type, sub_step) handler = self._deployment_handlers.get(handler_key) if handler is None: continue try: - result_for_post = DeploymentExecutionResult(successes=group.deployments) - await handler.post_process(result_for_post) + actual_result = sub_results.get( + sub_step, + DeploymentExecutionResult(successes=group.deployments), + ) + await handler.post_process(actual_result) except Exception as e: log.error( "Error during post-processing for sub-step {}: {}", @@ -414,28 +461,27 @@ async def _process_with_evaluator( # Transition completed deployments: swap revision and move to READY if eval_result.completed: - await self._transition_completed_deployments(lifecycle_type, eval_result.completed) + await self._transition_completed_deployments( + lifecycle_type, + eval_result.completed, + strategies=eval_result.completed_strategies, + ) async def _transition_completed_deployments( self, lifecycle_type: DeploymentLifecycleType, completed: list[DeploymentInfo], + strategies: dict[UUID, DeploymentStrategy], ) -> None: """Transition completed DEPLOYING deployments to READY. - 1. Swap deploying_revision → current_revision. + Atomically: + 1. Swap deploying_revision → current_revision (with idempotency guard). 2. Update lifecycle to READY with history recording. 3. Send notification events. 
""" endpoint_ids = {deployment.id for deployment in completed} - # Swap revisions - await self._deployment_repository.complete_deployment_revision_swap(endpoint_ids) - log.info( - "Swapped deploying_revision → current_revision for {} deployments", - len(endpoint_ids), - ) - # Build lifecycle transition target_statuses = [EndpointLifecycle.DEPLOYING] from_status = EndpointLifecycle.DEPLOYING @@ -455,7 +501,9 @@ async def _transition_completed_deployments( deployment_id=deployment.id, phase=lifecycle_type.value, result=SchedulingResult.SUCCESS, - message="Rolling update completed successfully", + message=f"Deployment completed successfully (strategy: {strategies[deployment.id].value})" + if deployment.id in strategies + else "Deployment completed successfully", from_status=from_status, to_status=to_status, sub_steps=[], @@ -463,8 +511,13 @@ async def _transition_completed_deployments( for deployment in completed ] - await self._deployment_repository.update_endpoint_lifecycle_bulk_with_history( - [batch_updater], BulkCreator(specs=history_specs) + # Atomic: revision swap + lifecycle update + history recording + await self._deployment_repository.complete_deployment_and_transition_to_ready( + endpoint_ids, [batch_updater], BulkCreator(specs=history_specs) + ) + log.info( + "Atomically swapped revision and transitioned {} deployments to READY", + len(endpoint_ids), ) # Send notifications diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index 69fe7f4a957..21cc194e43c 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -93,6 +93,7 @@ async def evaluate( # Group by sub-step if cycle_result.completed: result.completed.append(deployment) + result.completed_strategies[deployment.id] = policy.strategy else: group = result.groups.setdefault( cycle_result.sub_step, diff --git 
a/src/ai/backend/manager/sokovan/deployment/strategy/types.py b/src/ai/backend/manager/sokovan/deployment/strategy/types.py index dd8c61c13e5..a31fe4050fd 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/types.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/types.py @@ -5,6 +5,7 @@ from dataclasses import dataclass, field from uuid import UUID +from ai.backend.common.data.model_deployment.types import DeploymentStrategy from ai.backend.manager.data.deployment.types import ( DeploymentInfo, DeploymentSubStep, @@ -23,7 +24,7 @@ class RouteChanges: @dataclass class CycleEvaluationResult: - """Result of evaluating a single deployment's rolling update cycle.""" + """Result of evaluating a single deployment's strategy cycle.""" sub_step: DeploymentSubStep completed: bool = False @@ -42,7 +43,23 @@ class EvaluationGroup: class EvaluationResult: """Aggregate result of evaluating all DEPLOYING deployments.""" + # In-progress deployments grouped by sub-step (PROVISIONING, PROGRESSING, etc.). + # The coordinator looks up the handler for each sub-step and calls execute(). groups: dict[DeploymentSubStep, EvaluationGroup] = field(default_factory=dict) + + # Deployments that satisfied all strategy FSM conditions and are ready to finish. + # The coordinator performs an atomic revision swap + READY transition for these. completed: list[DeploymentInfo] = field(default_factory=list) + + # Maps each completed deployment to the strategy (ROLLING, BLUE_GREEN) it used. + # The coordinator includes this in the history message for observability. + completed_strategies: dict[UUID, DeploymentStrategy] = field(default_factory=dict) + + # Deployments skipped because no deployment policy was found. + # The coordinator records SKIPPED history and emits a warning log. skipped: list[DeploymentInfo] = field(default_factory=list) + + # Deployments that raised an exception during strategy FSM evaluation, paired + # with the error message. 
The coordinator records NEED_RETRY history and keeps + # the lifecycle at DEPLOYING so the next cycle can retry. errors: list[tuple[DeploymentInfo, str]] = field(default_factory=list) From 94505ecca67e3e5603e88ac0e01d7d27a63c36c8 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 05:13:26 +0000 Subject: [PATCH 07/23] refactor: rename HandlerKey to DeploymentHandlerKey for clarity Co-Authored-By: Claude Opus 4.6 --- .../manager/sokovan/deployment/coordinator.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index 99ceef1cb2f..33729200d65 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -65,7 +65,9 @@ from .types import DeploymentExecutionResult, DeploymentLifecycleType # Handler key: either a simple lifecycle type or a (lifecycle, sub-step) tuple -HandlerKey = DeploymentLifecycleType | tuple[DeploymentLifecycleType, DeploymentSubStep] +type DeploymentHandlerKey = ( + DeploymentLifecycleType | tuple[DeploymentLifecycleType, DeploymentSubStep] +) log = BraceStyleAdapter(logging.getLogger(__name__)) @@ -104,7 +106,7 @@ class DeploymentCoordinator: _valkey_schedule: ValkeyScheduleClient _deployment_controller: DeploymentController _deployment_repository: DeploymentRepository - _deployment_handlers: Mapping[HandlerKey, DeploymentHandler] + _deployment_handlers: Mapping[DeploymentHandlerKey, DeploymentHandler] _deployment_evaluators: Mapping[DeploymentLifecycleType, DeploymentStrategyEvaluator] _lock_factory: DistributedLockFactory _config_provider: ManagerConfigProvider @@ -149,9 +151,9 @@ def __init__( def _init_handlers( self, executor: DeploymentExecutor - ) -> Mapping[HandlerKey, DeploymentHandler]: + ) -> Mapping[DeploymentHandlerKey, DeploymentHandler]: """Initialize and return the mapping of handler keys to their 
handlers.""" - handlers: dict[HandlerKey, DeploymentHandler] = { + handlers: dict[DeploymentHandlerKey, DeploymentHandler] = { DeploymentLifecycleType.CHECK_PENDING: CheckPendingDeploymentHandler( deployment_executor=executor, deployment_controller=self._deployment_controller, @@ -388,7 +390,7 @@ async def _process_with_evaluator( # Process each sub-step group with its handler for sub_step, group in eval_result.groups.items(): - handler_key: HandlerKey = (lifecycle_type, sub_step) + handler_key: DeploymentHandlerKey = (lifecycle_type, sub_step) handler = self._deployment_handlers.get(handler_key) if handler is None: log.warning( From 605e1cee5ad9b6112f84d145eb0309e6269b84f8 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 07:12:10 +0000 Subject: [PATCH 08/23] docs: Update BEP --- proposals/BEP-1049/blue-green.md | 34 ++++++++++++++-------------- proposals/BEP-1049/rolling-update.md | 30 ++++++++++++------------ 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/proposals/BEP-1049/blue-green.md b/proposals/BEP-1049/blue-green.md index 1f21b8fbc64..0fd2dec8c00 100644 --- a/proposals/BEP-1049/blue-green.md +++ b/proposals/BEP-1049/blue-green.md @@ -77,15 +77,15 @@ The `DeploymentStrategyEvaluator` periodically evaluates each Blue-Green deploym ### Sub-Step Variants -Each cycle evaluation directly returns one of the shared sub-step variants: +Each cycle evaluation directly returns one of the shared sub-step variants. Completion is not a sub-step but a signal on `CycleEvaluationResult(sub_step=PROGRESSING, completed=True)` — the coordinator handles revision swap and READY transition directly. 
| Sub-Step | Condition | Handler Action | |----------|-----------|----------------| -| **provisioning** | No Green routes → created all as INACTIVE | DeployingInProgressHandler → DEPLOYING→DEPLOYING, reschedule | -| **provisioning** | Green routes are PROVISIONING | DeployingInProgressHandler → DEPLOYING→DEPLOYING, reschedule | -| **progressing** | Not all Green healthy (mixed state, no PROVISIONING) | DeployingInProgressHandler → DEPLOYING→DEPLOYING, reschedule | -| **progressing** | All Green healthy, waiting for promotion trigger (manual or delay) | DeployingInProgressHandler → DEPLOYING→DEPLOYING, reschedule | -| **completed** | Promotion executed (Green→ACTIVE, Blue→TERMINATING) | DeployingCompletedHandler → DEPLOYING→READY, revision swap | +| **provisioning** | No Green routes → created all as INACTIVE | DeployingProvisioningHandler → DEPLOYING→DEPLOYING, reschedule | +| **provisioning** | Green routes are PROVISIONING | DeployingProvisioningHandler → DEPLOYING→DEPLOYING, reschedule | +| **progressing** | Not all Green healthy (mixed state, no PROVISIONING) | DeployingProgressingHandler → DEPLOYING→DEPLOYING, reschedule | +| **progressing** | All Green healthy, waiting for promotion trigger (manual or delay) | DeployingProgressingHandler → DEPLOYING→DEPLOYING, reschedule | +| **progressing** (`completed=True`) | Promotion executed (Green→ACTIVE, Blue→TERMINATING) | Coordinator → atomic revision swap + DEPLOYING→READY | | **rolled_back** | All Green failed → terminate Green | DeployingRolledBackHandler → DEPLOYING→READY, deploying_revision=NULL | ## promote_delay_seconds Handling @@ -268,11 +268,12 @@ With `auto_promote=False`: ┌──────────────────────────────────────────────────────────────┐ │ Per-Sub-Step Handlers (coordinator generic path) │ │ │ - │ PROVISIONING/PROGRESSING → DeployingInProgressHandler │ + │ PROVISIONING → DeployingProvisioningHandler │ │ next_status: DEPLOYING → coordinator records history │ │ │ - │ COMPLETED → DeployingCompletedHandler │ - 
│ next_status: READY → revision swap + coordinator transit │ + │ PROGRESSING → DeployingProgressingHandler │ + │ next_status: DEPLOYING → coordinator records history │ + │ completed=True → coordinator atomic revision swap + READY │ │ │ │ ROLLED_BACK → DeployingRolledBackHandler │ │ next_status: READY → clear dep_rev + coordinator transit │ @@ -287,14 +288,13 @@ When all Green routes become ACTIVE and Blue routes are terminated: completed determination (evaluator) │ ▼ - DeployingCompletedHandler.execute() - → complete_deployment_revision_swap(ids) - current_revision = deploying_revision - deploying_revision = NULL - │ - ▼ - Coordinator generic path - → DEPLOYING → READY history recording + lifecycle transition + Coordinator._transition_completed_deployments() + → Atomic transaction: + 1. complete_deployment_revision_swap(ids) + current_revision = deploying_revision + deploying_revision = NULL + 2. DEPLOYING → READY lifecycle transition + 3. History recording ``` ## Comparison with Rolling Update diff --git a/proposals/BEP-1049/rolling-update.md b/proposals/BEP-1049/rolling-update.md index ed9aab3edbc..3c28d6ead0c 100644 --- a/proposals/BEP-1049/rolling-update.md +++ b/proposals/BEP-1049/rolling-update.md @@ -55,13 +55,13 @@ The `DeploymentStrategyEvaluator` periodically evaluates each Rolling Update dep ### Sub-Step Variants -Each cycle evaluation directly returns one of the shared sub-step variants: +Each cycle evaluation directly returns one of the shared sub-step variants. Completion is not a sub-step but a signal on `CycleEvaluationResult(sub_step=PROGRESSING, completed=True)` — the coordinator handles revision swap and READY transition directly. 
| Sub-Step | Condition | Handler Action | |----------|-----------|----------------| -| **provisioning** | New routes are PROVISIONING | DeployingInProgressHandler → DEPLOYING→DEPLOYING, reschedule | -| **progressing** | Calculated surge/unavailable, created/terminated routes | DeployingInProgressHandler → DEPLOYING→DEPLOYING, reschedule | -| **completed** | No Old routes and New healthy >= desired_replicas | DeployingCompletedHandler → DEPLOYING→READY, revision swap | +| **provisioning** | New routes are PROVISIONING | DeployingProvisioningHandler → DEPLOYING→DEPLOYING, reschedule | +| **progressing** | Calculated surge/unavailable, created/terminated routes | DeployingProgressingHandler → DEPLOYING→DEPLOYING, reschedule | +| **progressing** (`completed=True`) | No Old routes and New healthy >= desired_replicas | Coordinator → atomic revision swap + DEPLOYING→READY | ## max_surge / max_unavailable Calculation @@ -227,11 +227,12 @@ Example with `desired_replicas = 3`, `max_surge = 1`, `max_unavailable = 1`: ┌──────────────────────────────────────────────────────────────┐ │ Per-Sub-Step Handlers (coordinator generic path) │ │ │ - │ PROVISIONING/PROGRESSING → DeployingInProgressHandler │ + │ PROVISIONING → DeployingProvisioningHandler │ │ next_status: DEPLOYING → coordinator records history │ │ │ - │ COMPLETED → DeployingCompletedHandler │ - │ next_status: READY → revision swap + coordinator transit │ + │ PROGRESSING → DeployingProgressingHandler │ + │ next_status: DEPLOYING → coordinator records history │ + │ completed=True → coordinator atomic revision swap + READY │ └──────────────────────────────────────────────────────────────┘ ``` @@ -243,12 +244,11 @@ When all Old routes are removed and New routes reach desired_replicas or above a completed determination (evaluator) │ ▼ - DeployingCompletedHandler.execute() - → complete_deployment_revision_swap(ids) - current_revision = deploying_revision - deploying_revision = NULL - │ - ▼ - Coordinator generic path - → 
DEPLOYING → READY history recording + lifecycle transition + Coordinator._transition_completed_deployments() + → Atomic transaction: + 1. complete_deployment_revision_swap(ids) + current_revision = deploying_revision + deploying_revision = NULL + 2. DEPLOYING → READY lifecycle transition + 3. History recording ``` From 203c0e8e59b0699904d1350855d693053d1ae2ca Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 07:22:38 +0000 Subject: [PATCH 09/23] docs: Update BEP --- .../BEP-1049-deployment-strategy-handler.md | 46 +++++++------------ 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/proposals/BEP-1049-deployment-strategy-handler.md b/proposals/BEP-1049-deployment-strategy-handler.md index 3de4e24338a..88b2aa22e83 100644 --- a/proposals/BEP-1049-deployment-strategy-handler.md +++ b/proposals/BEP-1049-deployment-strategy-handler.md @@ -30,7 +30,7 @@ Rolling Update similarly progresses gradually across cycles. Both strategies **k ### Evaluator + Sub-Step Handler Pattern -A single `evaluate()` call may produce different sub-steps for different deployments — some completed, others still PROGRESSING. To handle this, a **strategy evaluator** groups deployments by sub-step, and **per-sub-step handlers** process each group. Completed deployments are returned separately in `EvaluationResult.completed` and processed via the PROGRESSING handler's `post_process`. +A single `evaluate()` call may produce different sub-steps for different deployments — some completed, others still PROGRESSING. To handle this, a **strategy evaluator** groups deployments by sub-step, and **per-sub-step handlers** process each group. Completed deployments are returned separately in `EvaluationResult.completed` and processed directly by the coordinator's `_transition_completed_deployments()`. 
| Aspect | How it works | |--------|-------------| @@ -81,7 +81,7 @@ Core idea: A **strategy evaluator** evaluates DEPLOYING-state deployments and gr │ ├─ evaluator exists → _process_with_evaluator() (evaluator path) │ │ └─ no evaluator → existing single-handler path │ │ │ -│ Handler map key: HandlerKey │ +│ Handler map key: DeploymentHandlerKey │ │ DeploymentLifecycleType ← single handlers │ │ | (DeploymentLifecycleType, DeploymentSubStep) ← sub-step handlers │ │ │ @@ -113,8 +113,6 @@ Core idea: A **strategy evaluator** evaluates DEPLOYING-state deployments and gr │ (DEPLOYING, PROGRESSING) │ │ → DeployingProgressingHandler │ │ next_status: DEPLOYING │ - │ post_process: revision swap │ - │ for completed deployments │ │ │ │ (DEPLOYING, ROLLED_BACK) │ │ → DeployingRolledBackHandler │ @@ -168,7 +166,7 @@ Both Blue-Green and Rolling Update cycle FSMs share a common set of **sub-step v | **progressing** | Strategy making active progress — health checks pending, promotion waiting, or routes being replaced | DeployingProgressingHandler | DEPLOYING → DEPLOYING | | **rolled_back** | Strategy failed — rolled back to previous revision | DeployingRolledBackHandler | DEPLOYING → READY | -Completion is not a sub-step but a signal on `CycleEvaluationResult.completed`. When the strategy FSM detects that all new routes are healthy and no old routes remain, it returns `CycleEvaluationResult(sub_step=PROGRESSING, completed=True)`. The evaluator collects these into `EvaluationResult.completed`, and the coordinator passes them to the PROGRESSING handler's `post_process` for revision swap, then transitions to READY. +Completion is not a sub-step but a signal on `CycleEvaluationResult.completed`. When the strategy FSM detects that all new routes are healthy and no old routes remain, it returns `CycleEvaluationResult(sub_step=PROGRESSING, completed=True)`. 
The evaluator collects these into `EvaluationResult.completed`, and the coordinator directly calls `_transition_completed_deployments()` which atomically performs the revision swap and DEPLOYING→READY transition. ### DeploymentStrategyEvaluator @@ -266,20 +264,14 @@ class DeployingInProgressHandler(DeploymentHandler): return DeploymentExecutionResult(successes=list(deployments)) async def post_process(self, result): - if result.successes: - await self._deployment_controller.mark_lifecycle_needed( - DeploymentLifecycleType.DEPLOYING # reschedule next cycle - ) + # Re-schedule DEPLOYING for the next coordinator cycle + await self._deployment_controller.mark_lifecycle_needed( + DeploymentLifecycleType.DEPLOYING + ) + # Trigger route provisioning so new routes get sessions await self._route_controller.mark_lifecycle_needed( - RouteLifecycleType.PROVISIONING # trigger new route provisioning + RouteLifecycleType.PROVISIONING ) - # Revision swap for completed deployments - # (coordinator attaches eval_result.completed to result.completed) - if result.completed: - swap_ids = [d.id for d in result.completed - if d.deploying_revision_id is not None] - if swap_ids: - await repo.complete_deployment_revision_swap(swap_ids) class DeployingProvisioningHandler(DeployingInProgressHandler): @@ -302,7 +294,7 @@ class DeployingProgressingHandler(DeployingInProgressHandler): `next_status().lifecycle == DEPLOYING` so the coordinator records DEPLOYING→DEPLOYING SUCCESS history for in-progress deployments. The deployment stays in DEPLOYING state and is re-evaluated next cycle. -For completed deployments, the coordinator passes `EvaluationResult.completed` to the PROGRESSING handler's `post_process` via `result.completed`. The handler performs the revision swap, then the coordinator transitions the deployment to READY with history recording. +For completed deployments, the coordinator directly calls `_transition_completed_deployments()` after all handler post-processing. 
This method atomically performs the revision swap (`complete_deployment_revision_swap`) and transitions the deployment to READY with history recording. #### DeployingRolledBackHandler (ROLLED_BACK) @@ -357,24 +349,20 @@ _process_with_evaluator(lifecycle_type, evaluator) │ │ │ │ └───────────────────────────────────────────────────────────────┘ │ - │ 4. Attach completed deployments to PROGRESSING handler's result - │ if eval_result.completed: - │ handler_results[PROGRESSING].result.completed = eval_result.completed - │ - │ 5. Post-process outside RecorderContext scope + │ 4. Post-process outside RecorderContext scope │ for sub_step, (handler, result) in handler_results: │ handler.post_process(result) - │ ↑ PROGRESSING handler performs revision swap for result.completed + │ ↑ reschedule DEPLOYING cycle + trigger route provisioning │ - │ 6. Lifecycle transition for completed deployments + │ 5. Transition completed deployments (coordinator direct) │ if eval_result.completed: - │ _transition_completed_deployments(completed, all_records) - │ ↑ DEPLOYING → READY + history recording + │ _transition_completed_deployments(completed) + │ ↑ atomic revision swap + DEPLOYING → READY + history recording │ ▼ ``` -Key: `_handle_status_transitions()` uses the **exact same generic method** as the single-handler path. It performs batch updates and history recording based on each handler's `next_status()`/`failure_status()`. Completed deployments bypass this path — their lifecycle transition is handled by `_transition_completed_deployments()` after the revision swap in `post_process`. +Key: `_handle_status_transitions()` uses the **exact same generic method** as the single-handler path. It performs batch updates and history recording based on each handler's `next_status()`/`failure_status()`. 
Completed deployments bypass this path — their lifecycle transition is handled directly by the coordinator's `_transition_completed_deployments()`, which atomically performs the revision swap and DEPLOYING→READY transition. ### Sub-Step Recording @@ -413,7 +401,7 @@ sub_steps: [strategy_result] determine_sub_step → success (message: "completed") ``` -The revision swap (`complete_deployment_revision_swap`) is performed by the PROGRESSING handler's `post_process` outside the recorder scope, so it does not appear in sub_steps. The coordinator then transitions the deployment to READY with history recording. +The revision swap (`complete_deployment_revision_swap`) is performed by the coordinator's `_transition_completed_deployments()` outside the recorder scope, so it does not appear in sub_steps. This method atomically swaps the revision and transitions the deployment to READY with history recording. Format is `[phase] step`. The `determine_sub_step` step's `message` field records the determined sub-step value. This information is stored as JSON in the `deployment_history` table's `sub_steps` column and is queryable via API/CLI. 
From 17e121eab88a35b4174457cd3ac06a4d2a99c634 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Mon, 2 Mar 2026 08:34:39 +0000 Subject: [PATCH 10/23] feat: Impl `activate_revision` --- .../deployment/db_source/db_source.py | 18 ++++++++++++------ .../repositories/deployment/repository.py | 8 ++++---- .../manager/services/deployment/service.py | 18 ++++++++++-------- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py index 82f6e598faa..f28257fb5ec 100644 --- a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py +++ b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py @@ -2087,24 +2087,30 @@ async def update_endpoint( return row.to_deployment_info() - async def update_current_revision( + async def start_deploying_revision( self, endpoint_id: uuid.UUID, revision_id: uuid.UUID, ) -> uuid.UUID | None: - """Update the current_revision of an endpoint and return the previous revision ID.""" + """Set deploying_revision and transition lifecycle to DEPLOYING. + + Returns the current (previous) revision ID for reference. + The coordinator will swap deploying_revision → current_revision on completion. 
+ """ async with self._begin_session_read_committed() as db_sess: # Get current revision first query = sa.select(EndpointRow.current_revision).where(EndpointRow.id == endpoint_id) result = await db_sess.execute(query) - row = result.scalar_one_or_none() - previous_revision_id = row + previous_revision_id = result.scalar_one_or_none() - # Update to new revision + # Set deploying_revision and transition to DEPLOYING update_query = ( sa.update(EndpointRow) .where(EndpointRow.id == endpoint_id) - .values(current_revision=revision_id) + .values( + deploying_revision=revision_id, + lifecycle_stage=EndpointLifecycle.DEPLOYING, + ) ) await db_sess.execute(update_query) diff --git a/src/ai/backend/manager/repositories/deployment/repository.py b/src/ai/backend/manager/repositories/deployment/repository.py index ec44457c91b..a087a717d45 100644 --- a/src/ai/backend/manager/repositories/deployment/repository.py +++ b/src/ai/backend/manager/repositories/deployment/repository.py @@ -1114,17 +1114,17 @@ async def update_endpoint( return await self._db_source.update_endpoint(updater) @deployment_repository_resilience.apply() - async def update_current_revision( + async def start_deploying_revision( self, endpoint_id: uuid.UUID, revision_id: uuid.UUID, ) -> uuid.UUID | None: - """Update the current revision of a deployment. + """Set deploying_revision and transition lifecycle to DEPLOYING. Returns: - The previous revision ID, or None if there was no previous revision. + The current (previous) revision ID, or None if there was no previous revision. 
""" - return await self._db_source.update_current_revision(endpoint_id, revision_id) + return await self._db_source.start_deploying_revision(endpoint_id, revision_id) # ========== Deployment Auto-Scaling Policy Operations ========== diff --git a/src/ai/backend/manager/services/deployment/service.py b/src/ai/backend/manager/services/deployment/service.py index 2b887aab2ad..c0ac60680b4 100644 --- a/src/ai/backend/manager/services/deployment/service.py +++ b/src/ai/backend/manager/services/deployment/service.py @@ -516,7 +516,11 @@ async def search_revisions(self, action: SearchRevisionsAction) -> SearchRevisio async def activate_revision( self, action: ActivateRevisionAction ) -> ActivateRevisionActionResult: - """Activate a specific revision to be the current revision. + """Activate a specific revision by initiating the deployment strategy. + + Sets deploying_revision and transitions the deployment to DEPLOYING state. + The coordinator will execute the configured deployment strategy (rolling update, + blue-green, etc.) and swap deploying_revision → current_revision on completion. Args: action: Action containing deployment and revision IDs @@ -527,18 +531,16 @@ async def activate_revision( # 1. Validate revision exists (raises exception if not found) _revision = await self._deployment_repository.get_revision(action.revision_id) - # 2. Update endpoint.current_revision and get previous revision - previous_revision_id = await self._deployment_repository.update_current_revision( + # 2. Set deploying_revision and transition to DEPLOYING lifecycle + previous_revision_id = await self._deployment_repository.start_deploying_revision( action.deployment_id, action.revision_id ) - # 3. Trigger lifecycle check to update routes with new revision - await self._deployment_controller.mark_lifecycle_needed( - DeploymentLifecycleType.CHECK_REPLICA - ) + # 3. 
Trigger DEPLOYING lifecycle to start strategy execution + await self._deployment_controller.mark_lifecycle_needed(DeploymentLifecycleType.DEPLOYING) log.info( - "Activated revision {} for deployment {} (previous: {})", + "Started deploying revision {} for deployment {} (current: {})", action.revision_id, action.deployment_id, previous_revision_id, From e0cd193c96d61ac08e24d350eb4d08e60ce4ba7e Mon Sep 17 00:00:00 2001 From: jopemachine Date: Tue, 3 Mar 2026 06:33:37 +0000 Subject: [PATCH 11/23] fix: Use match statement --- .../sokovan/deployment/strategy/evaluator.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index 21cc194e43c..4108609df65 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -114,23 +114,23 @@ def _evaluate_single( policy: DeploymentPolicyData, ) -> CycleEvaluationResult: """Dispatch to the appropriate strategy FSM.""" - if strategy == DeploymentStrategy.ROLLING: - spec = policy.strategy_spec - if not isinstance(spec, RollingUpdateSpec): - raise ValueError( - f"Expected RollingUpdateSpec for ROLLING strategy, got {type(spec).__name__}" - ) - return rolling_update_evaluate(deployment, routes, spec) - - if strategy == DeploymentStrategy.BLUE_GREEN: - spec = policy.strategy_spec - if not isinstance(spec, BlueGreenSpec): - raise ValueError( - f"Expected BlueGreenSpec for BLUE_GREEN strategy, got {type(spec).__name__}" - ) - return blue_green_evaluate(deployment, routes, spec) - - raise ValueError(f"Unsupported deployment strategy: {strategy}") + match strategy: + case DeploymentStrategy.ROLLING: + spec = policy.strategy_spec + if not isinstance(spec, RollingUpdateSpec): + raise ValueError( + f"Expected RollingUpdateSpec for ROLLING strategy, got {type(spec).__name__}" + ) + return 
rolling_update_evaluate(deployment, routes, spec) + case DeploymentStrategy.BLUE_GREEN: + spec = policy.strategy_spec + if not isinstance(spec, BlueGreenSpec): + raise ValueError( + f"Expected BlueGreenSpec for BLUE_GREEN strategy, got {type(spec).__name__}" + ) + return blue_green_evaluate(deployment, routes, spec) + case _: + raise ValueError(f"Unsupported deployment strategy: {strategy}") async def _apply_route_changes( self, From 164ee45086d238baa030e011ff212f8d187b1831 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Tue, 3 Mar 2026 07:00:51 +0000 Subject: [PATCH 12/23] WIP --- .../manager/sokovan/deployment/coordinator.py | 37 +++++++++++++++++-- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index 33729200d65..06beb37bc64 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -27,8 +27,12 @@ from ai.backend.common.leader.tasks.event_task import EventTaskSpec from ai.backend.logging import BraceStyleAdapter from ai.backend.manager.config.provider import ManagerConfigProvider -from ai.backend.manager.data.deployment.types import DeploymentInfo, DeploymentSubStep -from ai.backend.manager.data.session.types import SchedulingResult +from ai.backend.manager.data.deployment.types import ( + DeploymentInfo, + DeploymentSubStatus, + DeploymentSubStep, +) +from ai.backend.manager.data.session.types import SchedulingResult, SubStepResult from ai.backend.manager.defs import LockID from ai.backend.manager.models.endpoint import EndpointRow from ai.backend.manager.repositories.base.creator import BulkCreator @@ -270,6 +274,7 @@ async def _handle_status_transitions( next_lifecycle_status = transitions.success if next_lifecycle_status is not None and result.successes: next_lifecycle = next_lifecycle_status.lifecycle + sub_status = next_lifecycle_status.sub_status 
endpoint_ids = [d.id for d in result.successes] success_history_specs = [ DeploymentHistoryCreatorSpec( @@ -279,7 +284,9 @@ async def _handle_status_transitions( message=f"{handler_name} completed successfully", from_status=from_status, to_status=next_lifecycle, - sub_steps=extract_sub_steps_for_entity(d.id, records), + sub_steps=self._build_history_sub_steps( + d.id, records, sub_status, SchedulingResult.SUCCESS + ), ) for d in result.successes ] @@ -308,6 +315,7 @@ async def _handle_status_transitions( failure_lifecycle_status = transitions.failure if failure_lifecycle_status is not None and result.errors: failure_lifecycle = failure_lifecycle_status.lifecycle + failure_sub_status = failure_lifecycle_status.sub_status endpoint_ids = [e.deployment_info.id for e in result.errors] failure_history_specs = [ DeploymentHistoryCreatorSpec( @@ -318,7 +326,9 @@ async def _handle_status_transitions( from_status=from_status, to_status=failure_lifecycle, error_code=e.error_code, - sub_steps=extract_sub_steps_for_entity(e.deployment_info.id, records), + sub_steps=self._build_history_sub_steps( + e.deployment_info.id, records, failure_sub_status, SchedulingResult.FAILURE + ), ) for e in result.errors ] @@ -536,6 +546,25 @@ async def _transition_completed_deployments( except Exception as e: log.warning("Failed to send lifecycle notification: {}", e) + @staticmethod + def _build_history_sub_steps( + entity_id: UUID, + records: Mapping[UUID, ExecutionRecord], + sub_status: DeploymentSubStatus | None, + scheduling_result: SchedulingResult, + ) -> list[SubStepResult]: + """Build sub_steps list, appending sub_status as an entry if present.""" + sub_steps = extract_sub_steps_for_entity(entity_id, records) + if sub_status is not None: + now = datetime.now(UTC) + sub_steps.append(SubStepResult( + step=sub_status.value, + result=scheduling_result, + started_at=now, + ended_at=now, + )) + return sub_steps + def _build_lifecycle_notification_event( self, deployment: DeploymentInfo, From 
688ff4132ec355fd2cdbd156583cb69fc2973110 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Tue, 3 Mar 2026 07:38:38 +0000 Subject: [PATCH 13/23] fix: Replace `fetch_deployment_policies_by_endpoint_ids` with `search_deployment_polices` --- .../deployment/db_source/db_source.py | 22 ------------------- .../deployment/options/__init__.py | 3 +++ .../deployment/options/deployment_policy.py | 22 +++++++++++++++++++ .../repositories/deployment/repository.py | 8 ------- .../manager/sokovan/deployment/coordinator.py | 14 +++++++----- .../sokovan/deployment/strategy/evaluator.py | 15 +++++++++---- 6 files changed, 44 insertions(+), 40 deletions(-) create mode 100644 src/ai/backend/manager/repositories/deployment/options/deployment_policy.py diff --git a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py index f28257fb5ec..38aa75d45f2 100644 --- a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py +++ b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py @@ -2254,28 +2254,6 @@ async def delete_deployment_policy( async with self._begin_session_read_committed() as db_sess: return await execute_purger(db_sess, purger) - async def fetch_deployment_policies_by_endpoint_ids( - self, - endpoint_ids: set[uuid.UUID], - ) -> Mapping[uuid.UUID, DeploymentPolicyData]: - """Fetch deployment policies for multiple endpoints in bulk. - - Args: - endpoint_ids: Set of endpoint IDs to fetch policies for. - - Returns: - Mapping of endpoint ID to DeploymentPolicyData. 
- """ - if not endpoint_ids: - return {} - async with self._db.begin_readonly_session_read_committed() as db_sess: - query = sa.select(DeploymentPolicyRow).where( - DeploymentPolicyRow.endpoint.in_(endpoint_ids) - ) - result = await db_sess.execute(query) - rows = result.scalars().all() - return {row.endpoint: row.to_data() for row in rows} - async def complete_deployment_revision_swap( self, endpoint_ids: set[uuid.UUID], diff --git a/src/ai/backend/manager/repositories/deployment/options/__init__.py b/src/ai/backend/manager/repositories/deployment/options/__init__.py index aec91afa3b1..4705a1d2879 100644 --- a/src/ai/backend/manager/repositories/deployment/options/__init__.py +++ b/src/ai/backend/manager/repositories/deployment/options/__init__.py @@ -3,6 +3,7 @@ from .access_token import AccessTokenConditions, AccessTokenOrders from .auto_scaling_rule import AutoScalingRuleConditions, AutoScalingRuleOrders from .deployment import DeploymentConditions, DeploymentOrders +from .deployment_policy import DeploymentPolicyConditions from .revision import RevisionConditions, RevisionOrders from .route import RouteConditions, RouteOrders @@ -16,6 +17,8 @@ # Deployment "DeploymentConditions", "DeploymentOrders", + # DeploymentPolicy + "DeploymentPolicyConditions", # Revision "RevisionConditions", "RevisionOrders", diff --git a/src/ai/backend/manager/repositories/deployment/options/deployment_policy.py b/src/ai/backend/manager/repositories/deployment/options/deployment_policy.py new file mode 100644 index 00000000000..306074d03d2 --- /dev/null +++ b/src/ai/backend/manager/repositories/deployment/options/deployment_policy.py @@ -0,0 +1,22 @@ +"""Query conditions and orders for deployment policies.""" + +from __future__ import annotations + +import uuid +from collections.abc import Collection + +import sqlalchemy as sa + +from ai.backend.manager.models.deployment_policy import DeploymentPolicyRow +from ai.backend.manager.repositories.base import QueryCondition + + +class 
DeploymentPolicyConditions: + """Query conditions for deployment policies.""" + + @staticmethod + def by_endpoint_ids(endpoint_ids: Collection[uuid.UUID]) -> QueryCondition: + def inner() -> sa.sql.expression.ColumnElement[bool]: + return DeploymentPolicyRow.endpoint.in_(endpoint_ids) + + return inner diff --git a/src/ai/backend/manager/repositories/deployment/repository.py b/src/ai/backend/manager/repositories/deployment/repository.py index a087a717d45..d78f05ffe04 100644 --- a/src/ai/backend/manager/repositories/deployment/repository.py +++ b/src/ai/backend/manager/repositories/deployment/repository.py @@ -1216,14 +1216,6 @@ async def delete_deployment_policy( """ return await self._db_source.delete_deployment_policy(purger) - @deployment_repository_resilience.apply() - async def fetch_deployment_policies_by_endpoint_ids( - self, - endpoint_ids: set[uuid.UUID], - ) -> Mapping[uuid.UUID, DeploymentPolicyData]: - """Fetch deployment policies for multiple endpoints in bulk.""" - return await self._db_source.fetch_deployment_policies_by_endpoint_ids(endpoint_ids) - @deployment_repository_resilience.apply() async def complete_deployment_revision_swap( self, diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index 06beb37bc64..ff4465e78f1 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -557,12 +557,14 @@ def _build_history_sub_steps( sub_steps = extract_sub_steps_for_entity(entity_id, records) if sub_status is not None: now = datetime.now(UTC) - sub_steps.append(SubStepResult( - step=sub_status.value, - result=scheduling_result, - started_at=now, - ended_at=now, - )) + sub_steps.append( + SubStepResult( + step=sub_status.value, + result=scheduling_result, + started_at=now, + ended_at=now, + ) + ) return sub_steps def _build_lifecycle_notification_event( diff --git 
a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index 4108609df65..2280a0d7b29 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -21,10 +21,13 @@ ) from ai.backend.manager.models.deployment_policy import BlueGreenSpec, RollingUpdateSpec from ai.backend.manager.models.routing import RoutingRow -from ai.backend.manager.repositories.base import Creator +from ai.backend.manager.repositories.base import BatchQuerier, Creator, NoPagination from ai.backend.manager.repositories.base.updater import BatchUpdater from ai.backend.manager.repositories.deployment.creators import RouteBatchUpdaterSpec -from ai.backend.manager.repositories.deployment.options import RouteConditions +from ai.backend.manager.repositories.deployment.options import ( + DeploymentPolicyConditions, + RouteConditions, +) from ai.backend.manager.repositories.deployment.repository import DeploymentRepository from .blue_green import blue_green_evaluate @@ -60,9 +63,13 @@ async def evaluate( endpoint_ids = {d.id for d in deployments} # ── 1. Bulk-load policies and routes ── - policy_map = await self._deployment_repo.fetch_deployment_policies_by_endpoint_ids( - endpoint_ids + policy_search = await self._deployment_repo.search_deployment_policies( + BatchQuerier( + pagination=NoPagination(), + conditions=[DeploymentPolicyConditions.by_endpoint_ids(endpoint_ids)], + ) ) + policy_map = {p.endpoint: p for p in policy_search.items} route_map = await self._deployment_repo.fetch_active_routes_by_endpoint_ids(endpoint_ids) # ── 2. 
Per-deployment evaluation ── From 4ee4ca203b8e80a114ce043b8947af3b6cdc9cbb Mon Sep 17 00:00:00 2001 From: jopemachine Date: Tue, 3 Mar 2026 07:42:13 +0000 Subject: [PATCH 14/23] fix: Raise correct exception --- .../sokovan/deployment/strategy/evaluator.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index 2280a0d7b29..08188b427cb 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -19,6 +19,10 @@ RouteStatus, RouteTrafficStatus, ) +from ai.backend.manager.errors.deployment import ( + InvalidDeploymentStrategy, + InvalidDeploymentStrategySpec, +) from ai.backend.manager.models.deployment_policy import BlueGreenSpec, RollingUpdateSpec from ai.backend.manager.models.routing import RoutingRow from ai.backend.manager.repositories.base import BatchQuerier, Creator, NoPagination @@ -125,19 +129,21 @@ def _evaluate_single( case DeploymentStrategy.ROLLING: spec = policy.strategy_spec if not isinstance(spec, RollingUpdateSpec): - raise ValueError( - f"Expected RollingUpdateSpec for ROLLING strategy, got {type(spec).__name__}" + raise InvalidDeploymentStrategySpec( + extra_msg=f"Expected RollingUpdateSpec for ROLLING strategy, got {type(spec).__name__}" ) return rolling_update_evaluate(deployment, routes, spec) case DeploymentStrategy.BLUE_GREEN: spec = policy.strategy_spec if not isinstance(spec, BlueGreenSpec): - raise ValueError( - f"Expected BlueGreenSpec for BLUE_GREEN strategy, got {type(spec).__name__}" + raise InvalidDeploymentStrategySpec( + extra_msg=f"Expected BlueGreenSpec for BLUE_GREEN strategy, got {type(spec).__name__}" ) return blue_green_evaluate(deployment, routes, spec) case _: - raise ValueError(f"Unsupported deployment strategy: {strategy}") + raise InvalidDeploymentStrategy( + 
extra_msg=f"Unsupported deployment strategy: {strategy}" + ) async def _apply_route_changes( self, From f7a52158c058e898bd588d588f32c0c0030c7d06 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Wed, 4 Mar 2026 01:45:27 +0000 Subject: [PATCH 15/23] fix: Move `_apply_route_changes` into deployment coordinator --- .../manager/sokovan/deployment/coordinator.py | 50 +++++++++++++++-- .../sokovan/deployment/handlers/deploying.py | 8 +-- .../sokovan/deployment/strategy/evaluator.py | 53 +++---------------- .../sokovan/deployment/strategy/types.py | 4 ++ 4 files changed, 60 insertions(+), 55 deletions(-) diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index ff4465e78f1..48007f540e6 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -31,17 +31,24 @@ DeploymentInfo, DeploymentSubStatus, DeploymentSubStep, + RouteStatus, + RouteTrafficStatus, ) from ai.backend.manager.data.session.types import SchedulingResult, SubStepResult from ai.backend.manager.defs import LockID from ai.backend.manager.models.endpoint import EndpointRow +from ai.backend.manager.models.routing import RoutingRow from ai.backend.manager.repositories.base.creator import BulkCreator from ai.backend.manager.repositories.base.updater import BatchUpdater from ai.backend.manager.repositories.deployment import ( DeploymentConditions, DeploymentRepository, ) -from ai.backend.manager.repositories.deployment.creators import EndpointLifecycleBatchUpdaterSpec +from ai.backend.manager.repositories.deployment.creators import ( + EndpointLifecycleBatchUpdaterSpec, + RouteBatchUpdaterSpec, +) +from ai.backend.manager.repositories.deployment.options import RouteConditions from ai.backend.manager.repositories.scheduling_history.creators import DeploymentHistoryCreatorSpec from ai.backend.manager.sokovan.deployment.recorder import DeploymentRecorderContext from
ai.backend.manager.sokovan.deployment.route.route_controller import RouteController @@ -66,6 +73,7 @@ ScalingDeploymentHandler, ) from .strategy.evaluator import DeploymentStrategyEvaluator +from .strategy.types import EvaluationResult from .types import DeploymentExecutionResult, DeploymentLifecycleType # Handler key: either a simple lifecycle type or a (lifecycle, sub-step) tuple @@ -375,10 +383,11 @@ async def _process_with_evaluator( 1. Acquire distributed lock. 2. Load DEPLOYING deployments. - 3. Run evaluator (evaluates strategy FSM + applies route mutations). - 4. For each sub-step group, run the corresponding handler. - 5. Handle errors and skipped deployments. - 6. For completed deployments, swap revisions and transition to READY. + 3. Run evaluator (evaluates strategy FSM, aggregates route mutations). + 4. Apply aggregated route mutations. + 5. For each sub-step group, run the corresponding handler. + 6. Handle errors and skipped deployments. + 7. For completed deployments, swap revisions and transition to READY. 
""" lock_lifetime = self._config_provider.config.manager.session_schedule_lock_lifetime async with self._lock_factory(LockID.LOCKID_DEPLOYMENT_DEPLOYING, lock_lifetime): @@ -396,6 +405,10 @@ async def _process_with_evaluator( lifecycle_type.value, entity_ids=deployment_ids ) as pool: eval_result = await evaluator.evaluate(deployments) + + # Apply aggregated route mutations from the evaluation + await self._apply_route_changes(eval_result) + all_records = pool.build_all_records() # Process each sub-step group with its handler @@ -479,6 +492,33 @@ async def _process_with_evaluator( strategies=eval_result.completed_strategies, ) + async def _apply_route_changes( + self, + eval_result: EvaluationResult, + ) -> None: + """Apply aggregated route mutations from the evaluation result.""" + changes = eval_result.route_changes + if not changes.scale_out_specs and not changes.scale_in_route_ids: + return + + scale_in_updater: BatchUpdater[RoutingRow] | None = None + if changes.scale_in_route_ids: + scale_in_updater = BatchUpdater( + spec=RouteBatchUpdaterSpec( + status=RouteStatus.TERMINATING, + traffic_ratio=0.0, + traffic_status=RouteTrafficStatus.INACTIVE, + ), + conditions=[RouteConditions.by_ids(changes.scale_in_route_ids)], + ) + + await self._deployment_repository.scale_routes(changes.scale_out_specs, scale_in_updater) + log.debug( + "Applied route changes: {} created, {} terminated", + len(changes.scale_out_specs), + len(changes.scale_in_route_ids), + ) + async def _transition_completed_deployments( self, lifecycle_type: DeploymentLifecycleType, diff --git a/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py b/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py index 93e60da0467..95d07c94a32 100644 --- a/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py +++ b/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py @@ -1,9 +1,9 @@ """Handlers for DEPLOYING sub-steps (BEP-1049). 
-In-progress handlers (PROVISIONING, PROGRESSING) run *after* the strategy -evaluator has already applied route mutations. Their ``execute`` simply -returns success. ``post_process`` triggers the next DEPLOYING cycle and -route provisioning. +In-progress handlers (PROVISIONING, PROGRESSING) run *after* the coordinator +has applied route mutations from the evaluation result. Their ``execute`` +simply returns success. ``post_process`` triggers the next DEPLOYING cycle +and route provisioning. The rolled-back handler clears ``deploying_revision`` and transitions the deployment back to READY. diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index 08188b427cb..2c730bc5513 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -1,14 +1,14 @@ """Deployment strategy evaluator — orchestrates per-deployment FSM evaluation (BEP-1049). Loads policies and routes in bulk, dispatches each deployment to the appropriate -strategy FSM, aggregates route mutations, and applies them in one batch. +strategy FSM, and aggregates route mutations. The coordinator is responsible for +applying the aggregated route changes after evaluation. 
""" from __future__ import annotations import logging from collections.abc import Sequence -from uuid import UUID from ai.backend.common.data.model_deployment.types import DeploymentStrategy from ai.backend.logging import BraceStyleAdapter @@ -16,21 +16,15 @@ DeploymentInfo, DeploymentPolicyData, RouteInfo, - RouteStatus, - RouteTrafficStatus, ) from ai.backend.manager.errors.deployment import ( InvalidDeploymentStrategy, InvalidDeploymentStrategySpec, ) from ai.backend.manager.models.deployment_policy import BlueGreenSpec, RollingUpdateSpec -from ai.backend.manager.models.routing import RoutingRow -from ai.backend.manager.repositories.base import BatchQuerier, Creator, NoPagination -from ai.backend.manager.repositories.base.updater import BatchUpdater -from ai.backend.manager.repositories.deployment.creators import RouteBatchUpdaterSpec +from ai.backend.manager.repositories.base import BatchQuerier, NoPagination from ai.backend.manager.repositories.deployment.options import ( DeploymentPolicyConditions, - RouteConditions, ) from ai.backend.manager.repositories.deployment.repository import DeploymentRepository @@ -56,7 +50,7 @@ async def evaluate( Steps: 1. Bulk-load policies and active routes. 2. Per-deployment: dispatch to strategy FSM. - 3. Aggregate route changes and apply in one batch. + 3. Aggregate route changes into result (applied by coordinator). 4. Group deployments by sub-step and return. """ result = EvaluationResult() @@ -77,9 +71,6 @@ async def evaluate( route_map = await self._deployment_repo.fetch_active_routes_by_endpoint_ids(endpoint_ids) # ── 2. Per-deployment evaluation ── - all_scale_out: list[Creator[RoutingRow]] = [] - all_scale_in_ids: list[UUID] = [] - for deployment in deployments: policy = policy_map.get(deployment.id) if policy is None: @@ -96,10 +87,10 @@ async def evaluate( result.errors.append((deployment, str(e))) continue - # Collect route changes + # ── 3. 
Aggregate route changes ── changes = cycle_result.route_changes - all_scale_out.extend(changes.scale_out_specs) - all_scale_in_ids.extend(changes.scale_in_route_ids) + result.route_changes.scale_out_specs.extend(changes.scale_out_specs) + result.route_changes.scale_in_route_ids.extend(changes.scale_in_route_ids) # Group by sub-step if cycle_result.completed: @@ -112,9 +103,6 @@ async def evaluate( ) group.deployments.append(deployment) - # ── 3. Apply route mutations in batch ── - await self._apply_route_changes(all_scale_out, all_scale_in_ids) - return result def _evaluate_single( @@ -144,30 +132,3 @@ def _evaluate_single( raise InvalidDeploymentStrategy( extra_msg=f"Unsupported deployment strategy: {strategy}" ) - - async def _apply_route_changes( - self, - scale_out: list[Creator[RoutingRow]], - scale_in_ids: list[UUID], - ) -> None: - """Apply aggregated route mutations in a single DB transaction.""" - if not scale_out and not scale_in_ids: - return - - scale_in_updater: BatchUpdater[RoutingRow] | None = None - if scale_in_ids: - scale_in_updater = BatchUpdater( - spec=RouteBatchUpdaterSpec( - status=RouteStatus.TERMINATING, - traffic_ratio=0.0, - traffic_status=RouteTrafficStatus.INACTIVE, - ), - conditions=[RouteConditions.by_ids(scale_in_ids)], - ) - - await self._deployment_repo.scale_routes(scale_out, scale_in_updater) - log.debug( - "Applied route changes: {} created, {} terminated", - len(scale_out), - len(scale_in_ids), - ) diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/types.py b/src/ai/backend/manager/sokovan/deployment/strategy/types.py index a31fe4050fd..4347e90a239 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/types.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/types.py @@ -63,3 +63,7 @@ class EvaluationResult: # with the error message. The coordinator records NEED_RETRY history and keeps # the lifecycle at DEPLOYING so the next cycle can retry. 
errors: list[tuple[DeploymentInfo, str]] = field(default_factory=list) + + # Aggregated route mutations from all per-deployment evaluations. + # The coordinator applies these after evaluation completes. + route_changes: RouteChanges = field(default_factory=RouteChanges) From 19fe5c67760b11594542936d078cab4b5c578d2f Mon Sep 17 00:00:00 2001 From: jopemachine Date: Wed, 4 Mar 2026 02:18:32 +0000 Subject: [PATCH 16/23] wip --- .../manager/sokovan/deployment/coordinator.py | 12 +++---- .../sokovan/deployment/strategy/evaluator.py | 31 ++++++++++++++++--- .../sokovan/deployment/strategy/types.py | 4 +-- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index 48007f540e6..d617fda9569 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -498,25 +498,25 @@ async def _apply_route_changes( ) -> None: """Apply aggregated route mutations from the evaluation result.""" changes = eval_result.route_changes - if not changes.scale_out_specs and not changes.scale_in_route_ids: + if not changes.rollout_specs and not changes.drain_route_ids: return scale_in_updater: BatchUpdater[RoutingRow] | None = None - if changes.scale_in_route_ids: + if changes.drain_route_ids: scale_in_updater = BatchUpdater( spec=RouteBatchUpdaterSpec( status=RouteStatus.TERMINATING, traffic_ratio=0.0, traffic_status=RouteTrafficStatus.INACTIVE, ), - conditions=[RouteConditions.by_ids(changes.scale_in_route_ids)], + conditions=[RouteConditions.by_ids(changes.drain_route_ids)], ) - await self._deployment_repository.scale_routes(changes.scale_out_specs, scale_in_updater) + await self._deployment_repository.scale_routes(changes.rollout_specs, scale_in_updater) log.debug( "Applied route changes: {} created, {} terminated", - len(changes.scale_out_specs), - len(changes.scale_in_route_ids), + 
len(changes.rollout_specs), + len(changes.drain_route_ids), ) async def _transition_completed_deployments( diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index 2c730bc5513..9d84ced2104 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -27,10 +27,11 @@ DeploymentPolicyConditions, ) from ai.backend.manager.repositories.deployment.repository import DeploymentRepository +from ai.backend.manager.sokovan.deployment.recorder import DeploymentRecorderContext from .blue_green import blue_green_evaluate from .rolling_update import rolling_update_evaluate -from .types import CycleEvaluationResult, EvaluationGroup, EvaluationResult +from .types import CycleEvaluationResult, EvaluationGroup, EvaluationResult, RouteChanges log = BraceStyleAdapter(logging.getLogger(__name__)) @@ -87,10 +88,11 @@ async def evaluate( result.errors.append((deployment, str(e))) continue - # ── 3. Aggregate route changes ── + # ── 3. 
Aggregate route changes and record sub-steps ── changes = cycle_result.route_changes - result.route_changes.scale_out_specs.extend(changes.scale_out_specs) - result.route_changes.scale_in_route_ids.extend(changes.scale_in_route_ids) + result.route_changes.rollout_specs.extend(changes.rollout_specs) + result.route_changes.drain_route_ids.extend(changes.drain_route_ids) + self._record_route_changes(deployment, changes) # Group by sub-step if cycle_result.completed: @@ -105,6 +107,27 @@ async def evaluate( return result + @staticmethod + def _record_route_changes(deployment: DeploymentInfo, changes: RouteChanges) -> None: + """Record rollout/drain operations as sub-steps for observability.""" + if not changes.rollout_specs and not changes.drain_route_ids: + return + pool = DeploymentRecorderContext.current_pool() + recorder = pool.recorder(deployment.id) + with recorder.phase("route_mutations"): + if changes.rollout_specs: + with recorder.step( + "rollout", + success_detail=f"{len(changes.rollout_specs)} new route(s)", + ): + pass + if changes.drain_route_ids: + with recorder.step( + "drain", + success_detail=f"{len(changes.drain_route_ids)} route(s)", + ): + pass + def _evaluate_single( self, deployment: DeploymentInfo, diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/types.py b/src/ai/backend/manager/sokovan/deployment/strategy/types.py index 4347e90a239..615d6e8238f 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/types.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/types.py @@ -18,8 +18,8 @@ class RouteChanges: """Route mutations to apply for a single deployment cycle.""" - scale_out_specs: list[Creator[RoutingRow]] = field(default_factory=list) - scale_in_route_ids: list[UUID] = field(default_factory=list) + rollout_specs: list[Creator[RoutingRow]] = field(default_factory=list) + drain_route_ids: list[UUID] = field(default_factory=list) @dataclass From 7eabc9144f6657893c99c9ba4e786fcc82b201e9 Mon Sep 17 00:00:00 2001 
From: jopemachine Date: Wed, 4 Mar 2026 03:57:55 +0000 Subject: [PATCH 17/23] feat: Include current-cycle sub-steps in deployment completion history --- src/ai/backend/manager/sokovan/deployment/coordinator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index d617fda9569..d32a3497133 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -490,6 +490,7 @@ async def _process_with_evaluator( lifecycle_type, eval_result.completed, strategies=eval_result.completed_strategies, + records=all_records, ) async def _apply_route_changes( @@ -524,6 +525,7 @@ async def _transition_completed_deployments( lifecycle_type: DeploymentLifecycleType, completed: list[DeploymentInfo], strategies: dict[UUID, DeploymentStrategy], + records: Mapping[UUID, ExecutionRecord], ) -> None: """Transition completed DEPLOYING deployments to READY. @@ -558,7 +560,7 @@ async def _transition_completed_deployments( else "Deployment completed successfully", from_status=from_status, to_status=to_status, - sub_steps=[], + sub_steps=extract_sub_steps_for_entity(deployment.id, records), ) for deployment in completed ] From 9afc4a28609d79c997ef48916ba69d33722e22ba Mon Sep 17 00:00:00 2001 From: jopemachine Date: Wed, 4 Mar 2026 04:36:04 +0000 Subject: [PATCH 18/23] docs: Update BEP --- .../BEP-1049-deployment-strategy-handler.md | 73 ++++++++++--------- proposals/BEP-1049/blue-green.md | 24 +++--- proposals/BEP-1049/rolling-update.md | 14 ++-- 3 files changed, 55 insertions(+), 56 deletions(-) diff --git a/proposals/BEP-1049-deployment-strategy-handler.md b/proposals/BEP-1049-deployment-strategy-handler.md index 88b2aa22e83..ec208bdca66 100644 --- a/proposals/BEP-1049-deployment-strategy-handler.md +++ b/proposals/BEP-1049-deployment-strategy-handler.md @@ -36,7 +36,7 @@ A single `evaluate()` call may produce different sub-steps for different deploym
|--------|-------------| | **State transition** | Each sub-step handler returns explicit `next_status()` → coordinator's generic path handles all transitions | | **Routing** | Coordinator branches to evaluator path for `DeploymentLifecycleType.DEPLOYING` | -| **Cycles** | Evaluator runs strategy FSM + applies route changes → handlers process results → coordinator records history | +| **Cycles** | Evaluator runs strategy FSM → coordinator applies route changes → handlers process results → coordinator records history | ## Sub-documents @@ -100,7 +100,7 @@ Core idea: A **strategy evaluator** evaluates DEPLOYING-state deployments and gr │ Implementations: │ │ evaluate(deployments) → EvaluationResult │ │ ├─ CheckPending │ │ 1. Load policies/routes │ │ ├─ Scaling │ │ 2. Run strategy FSM → CycleEvaluationResult │ -│ ├─ CheckReplica │ │ 3. Apply route changes (scale_out/scale_in) │ +│ ├─ CheckReplica │ │ 3. Aggregate route changes │ │ ├─ Reconcile │ │ 4. Group by sub-step │ │ └─ Destroying │ └───────────────┬──────────────────────────────────┘ └─────────────────────┘ │ @@ -180,7 +180,7 @@ DeploymentStrategyEvaluator.evaluate(deployments) │ Phase 1: Load policies and routes │ ┌─────────────────────────────────────────────────────────┐ │ │ policy_map = load_policies(deployments) │ - │ │ route_map = fetch_active_routes_by_endpoint_ids(...) │ + │ │ route_map = fetch_routes_by_endpoint_ids(...) 
│ │ └─────────────────────────────────────────────────────────┘ │ │ Phase 2: Run per-deployment strategy FSM @@ -200,14 +200,6 @@ DeploymentStrategyEvaluator.evaluate(deployments) │ │ groups[cycle_result.sub_step].append(deployment) │ │ └─────────────────────────────────────────────────────────┘ │ - │ Phase 3: Apply route changes (in-progress only) - │ ┌─────────────────────────────────────────────────────────┐ - │ │ Collect route changes from PROVISIONING/PROGRESSING: │ - │ │ scale_out_creators → create new routes │ - │ │ scale_in_updater → terminate old routes │ - │ │ repo.scale_routes(scale_out, scale_in) │ - │ └─────────────────────────────────────────────────────────┘ - │ ▼ EvaluationResult { groups: { @@ -217,13 +209,18 @@ DeploymentStrategyEvaluator.evaluate(deployments) completed: [deploy_D], # strategy completed (revision swap pending) skipped: [deploy_E], # no policy / unsupported strategy errors: [error_F], # exception during evaluation + route_changes: RouteChanges { + rollout_specs: [Creator, ...], # new routes to create + drain_route_ids: [UUID, ...], # old routes to terminate + promote_route_ids: [UUID, ...], # green routes to activate (Blue-Green) + }, } ``` #### Key Design Principles -1. **Route changes are applied by the evaluator**: scale_out/scale_in are applied once in the evaluator. Individual handlers do not touch routes. -2. **Strategy FSMs live in the evaluator**: `_rolling_update_evaluate()`, `_blue_green_evaluate()` and other strategy FSM logic are internal helper methods of the evaluator. +1. **Route changes are aggregated by the evaluator, applied by the coordinator**: The evaluator collects route mutations (rollout/drain/promote) from each strategy FSM into `EvaluationResult.route_changes`. The coordinator's `_apply_route_changes()` applies them after evaluation. Individual handlers do not touch routes. +2. 
**Strategy FSMs are separate modules dispatched by the evaluator**: `rolling_update_evaluate()` and `blue_green_evaluate()` live in dedicated module files (`strategy/rolling_update.py`, `strategy/blue_green.py`). The evaluator dispatches to them based on the deployment policy's strategy type. 3. **Only grouping is returned**: The evaluator classifies deployments by sub-step; actual processing (revision swap, deploying_revision cleanup, etc.) is delegated to handlers. ### Per-Sub-Step Handlers @@ -245,7 +242,7 @@ The coordinator's `_handle_status_transitions()` extracts `.lifecycle` for DB up #### DeployingInProgressHandler (base) → Provisioning / Progressing -PROVISIONING and PROGRESSING share the same logic (evaluator already applied route changes; handler returns success + reschedules), so `DeployingInProgressHandler` base class defines common behavior, and subclasses hard-code their sub-step-specific `next_status()` and `status_transitions()`: +PROVISIONING and PROGRESSING share the same logic (coordinator already applied route changes; handler returns success + reschedules), so `DeployingInProgressHandler` base class defines common behavior, and subclasses hard-code their sub-step-specific `next_status()` and `status_transitions()`: ```python class DeployingInProgressHandler(DeploymentHandler): @@ -260,7 +257,7 @@ class DeployingInProgressHandler(DeploymentHandler): return None async def execute(self, deployments): - # Route changes already applied by evaluator + # Route changes already applied by coordinator return DeploymentExecutionResult(successes=list(deployments)) async def post_process(self, result): @@ -335,6 +332,8 @@ _process_with_evaluator(lifecycle_type, evaluator) │ ┌───────────────────────────────────────────────────────────────┐ │ │ │ │ │ eval_result = evaluator.evaluate(deployments) │ + │ │ _apply_route_changes(eval_result) │ + │ │ ↑ coordinator applies rollout/drain/promote │ │ │ │ │ │ for sub_step, group in eval_result.groups: │ │ │ handler = 
handlers[(lifecycle_type, sub_step)] │ @@ -356,8 +355,9 @@ _process_with_evaluator(lifecycle_type, evaluator) │ │ 5. Transition completed deployments (coordinator direct) │ if eval_result.completed: - │ _transition_completed_deployments(completed) + │ _transition_completed_deployments(completed, records=all_records) │ ↑ atomic revision swap + DEPLOYING → READY + history recording + │ ↑ includes route mutation sub_steps from this cycle │ ▼ ``` @@ -370,45 +370,50 @@ Each cycle evaluation produces sub-step variants recorded via the existing `Depl The coordinator's `_handle_status_transitions()` calls `extract_sub_steps_for_entity()` for each handler's result, including the deployment's sub-step information in the history. -#### Rolling Update Per-Cycle Recording Examples +#### Sub-Step Recording: Route Mutation Granularity + +Sub-steps are recorded at the **route mutation level** by the evaluator's `_record_route_changes()`. Each route mutation type (rollout, drain, promote) is recorded as a separate sub-step entry with the count of affected routes. 
-**PROVISIONING cycle** — new routes still being provisioned: +**PROVISIONING cycle** — new routes created: ``` sub_steps: - [rolling_update_evaluate] classify_routes → success - [rolling_update_evaluate] wait_provisioning → success - [strategy_result] determine_sub_step → success (message: "provisioning") + rollout → SUCCESS (message: "3 new route(s)") + provisioning → SUCCESS ``` **PROGRESSING cycle** — creating new routes / terminating old routes: ``` sub_steps: - [rolling_update_evaluate] classify_routes → success - [rolling_update_evaluate] check_completion → success - [rolling_update_evaluate] calculate_surge → success - [rolling_update_evaluate] build_route_changes → success - [strategy_result] determine_sub_step → success (message: "progressing") + rollout → SUCCESS (message: "1 new route(s)") + drain → SUCCESS (message: "1 route(s)") + progressing → SUCCESS +``` + +**COMPLETED cycle (Blue-Green)** — promotion executed: + +``` +sub_steps: + drain → SUCCESS (message: "3 route(s)") + promote → SUCCESS (message: "3 route(s)") ``` -**COMPLETED cycle** — all new routes healthy, no old routes remaining: +**COMPLETED cycle (Rolling Update)** — final drain: ``` sub_steps: - [rolling_update_evaluate] classify_routes → success - [rolling_update_evaluate] check_completion → success - [strategy_result] determine_sub_step → success (message: "completed") + drain → SUCCESS (message: "1 route(s)") ``` -The revision swap (`complete_deployment_revision_swap`) is performed by the coordinator's `_transition_completed_deployments()` outside the recorder scope, so it does not appear in sub_steps. This method atomically swaps the revision and transitions the deployment to READY with history recording. +Route mutation sub-steps are recorded within the `DeploymentRecorderContext` scope. For in-progress deployments, handlers add their own sub-step (e.g., `provisioning`, `progressing`) to the same record. 
For completed deployments, `_transition_completed_deployments()` receives the recorder pool's `all_records` and includes the current cycle's route mutation sub-steps in the completion history. -Format is `[phase] step`. The `determine_sub_step` step's `message` field records the determined sub-step value. This information is stored as JSON in the `deployment_history` table's `sub_steps` column and is queryable via API/CLI. +The revision swap (`complete_deployment_revision_swap`) is an atomic DB operation that does not appear as a sub-step. This enables: -- **Observability**: Each deployment's progress is tracked per-entity with sub-step granularity (e.g., "provisioning", "progressing", "completed") -- **Debugging**: The sub-step history shows exactly which phase each deployment was in at each cycle +- **Observability**: Each deployment's progress is tracked per-entity with route mutation granularity +- **Debugging**: The sub-step history shows exactly which route mutations occurred at each cycle - **Consistency**: All handlers use the same coordinator generic path ### Per-Strategy Configuration diff --git a/proposals/BEP-1049/blue-green.md b/proposals/BEP-1049/blue-green.md index 0fd2dec8c00..a5d25f0f68e 100644 --- a/proposals/BEP-1049/blue-green.md +++ b/proposals/BEP-1049/blue-green.md @@ -220,8 +220,8 @@ With `auto_promote=False`: │ strategy = policy.strategy │ │ 3. Dispatch by strategy: │ │ BLUE_GREEN → blue_green_evaluate(...) │ - │ 4. Group by sub_step and return │ - │ 5. Apply route changes (scale_out + scale_in) │ + │ 4. 
Aggregate route changes + group by sub_step │ + │ Coordinator applies route changes after evaluation │ └──────────────────────────┬───────────────────────────────────┘ │ ▼ @@ -240,27 +240,23 @@ With `auto_promote=False`: │ │ blue_active: blue + is_active() │ │ │ └────────────────────────────────────────────────────┘ │ │ │ - │ Actions applied: │ + │ Route changes returned (applied by coordinator): │ │ ┌────────────────────────────────────────────────────┐ │ - │ │ ● Green creation: │ │ + │ │ ● Green creation (rollout_specs): │ │ │ │ RouteCreatorSpec( │ │ │ │ revision_id = deploying_revision, │ │ │ │ traffic_status = INACTIVE ← differs from RU │ │ │ │ ) × target_count │ │ │ │ │ │ │ │ ● Promotion (traffic switch): │ │ - │ │ Green: RouteBatchUpdaterSpec( │ │ - │ │ traffic_status = ACTIVE │ │ - │ │ ) │ │ - │ │ Blue: RouteBatchUpdaterSpec( │ │ - │ │ status = TERMINATING, │ │ - │ │ traffic_status = INACTIVE │ │ - │ │ ) │ │ + │ │ promote_route_ids: Green route IDs │ │ + │ │ → traffic_status = ACTIVE │ │ + │ │ drain_route_ids: Blue route IDs │ │ + │ │ → status = TERMINATING │ │ │ │ │ │ │ │ ● Rollback: │ │ - │ │ Green: RouteBatchUpdaterSpec( │ │ - │ │ status = TERMINATING │ │ - │ │ ) │ │ + │ │ drain_route_ids: Green route IDs │ │ + │ │ → status = TERMINATING │ │ │ └────────────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────────────────┘ │ diff --git a/proposals/BEP-1049/rolling-update.md b/proposals/BEP-1049/rolling-update.md index 3c28d6ead0c..1714f9b72ab 100644 --- a/proposals/BEP-1049/rolling-update.md +++ b/proposals/BEP-1049/rolling-update.md @@ -190,8 +190,8 @@ Example with `desired_replicas = 3`, `max_surge = 1`, `max_unavailable = 1`: │ strategy = policy.strategy │ │ 3. Dispatch by strategy: │ │ ROLLING → rolling_update_evaluate(...) │ - │ 4. Group by sub_step and return │ - │ 5. Apply route changes (scale_out + scale_in) │ + │ 4. 
Aggregate route changes + group by sub_step │ + │ Coordinator applies route changes after evaluation │ └──────────────────────────┬───────────────────────────────────┘ │ ▼ @@ -209,17 +209,15 @@ Example with `desired_replicas = 3`, `max_surge = 1`, `max_unavailable = 1`: │ │ old_active: old + is_active() │ │ │ └────────────────────────────────────────────────────┘ │ │ │ - │ Actions applied: │ + │ Route changes returned (applied by coordinator): │ │ ┌────────────────────────────────────────────────────┐ │ - │ │ scale_out: RouteCreatorSpec( │ │ + │ │ rollout_specs: RouteCreatorSpec( │ │ │ │ revision_id = deploying_revision, │ │ │ │ traffic_status = ACTIVE ← differs from BG │ │ │ │ ) │ │ │ │ │ │ - │ │ scale_in: RouteBatchUpdaterSpec( │ │ - │ │ status = TERMINATING, │ │ - │ │ traffic_status = INACTIVE │ │ - │ │ ) │ │ + │ │ drain_route_ids: old route IDs │ │ + │ │ → status = TERMINATING │ │ │ └────────────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────────────────┘ │ From ee615d35f1595c196445ef633f80d1f90f811366 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Wed, 4 Mar 2026 06:25:15 +0000 Subject: [PATCH 19/23] feat: Introduce `BaseDeploymentStrategy`, `DeploymentStrategyRegistry` --- .../BEP-1049-deployment-strategy-handler.md | 8 +- .../manager/sokovan/deployment/coordinator.py | 15 +++- .../sokovan/deployment/strategy/__init__.py | 10 +++ .../sokovan/deployment/strategy/blue_green.py | 23 +++-- .../sokovan/deployment/strategy/evaluator.py | 86 ++++++++++++------- .../deployment/strategy/rolling_update.py | 23 +++-- .../sokovan/deployment/strategy/types.py | 23 +++++ 7 files changed, 136 insertions(+), 52 deletions(-) diff --git a/proposals/BEP-1049-deployment-strategy-handler.md b/proposals/BEP-1049-deployment-strategy-handler.md index ec208bdca66..604af7bf369 100644 --- a/proposals/BEP-1049-deployment-strategy-handler.md +++ b/proposals/BEP-1049-deployment-strategy-handler.md @@ -189,10 +189,8 @@ 
DeploymentStrategyEvaluator.evaluate(deployments) │ │ policy = policy_map[deployment.id] │ │ │ routes = route_map[deployment.id] │ │ │ │ - │ │ if policy.strategy == ROLLING: │ - │ │ cycle_result = rolling_update_evaluate(...) │ - │ │ elif policy.strategy == BLUE_GREEN: │ - │ │ cycle_result = blue_green_evaluate(...) │ + │ │ strategy_fsm = create_strategy(policy) │ + │ │ cycle_result = strategy_fsm.evaluate_cycle(...) │ │ │ │ │ │ if cycle_result.completed: │ │ │ completed.append(deployment) │ @@ -220,7 +218,7 @@ DeploymentStrategyEvaluator.evaluate(deployments) #### Key Design Principles 1. **Route changes are aggregated by the evaluator, applied by the coordinator**: The evaluator collects route mutations (rollout/drain/promote) from each strategy FSM into `EvaluationResult.route_changes`. The coordinator's `_apply_route_changes()` applies them after evaluation. Individual handlers do not touch routes. -2. **Strategy FSMs are separate modules dispatched by the evaluator**: `rolling_update_evaluate()` and `blue_green_evaluate()` live in dedicated module files (`strategy/rolling_update.py`, `strategy/blue_green.py`). The evaluator dispatches to them based on the deployment policy's strategy type. +2. **Strategy FSMs implement a common interface via registry**: All strategy implementations extend the `BaseDeploymentStrategy` abstract base class and implement `evaluate_cycle()`. Concrete classes (`RollingUpdateStrategy`, `BlueGreenStrategy`) live in dedicated module files (`strategy/rolling_update.py`, `strategy/blue_green.py`). The coordinator owns a `StrategyRegistry` that maps each `DeploymentStrategy` enum to its implementation class and expected spec type. The registry is injected into the evaluator, which uses it to instantiate the appropriate strategy per deployment. 3. **Only grouping is returned**: The evaluator classifies deployments by sub-step; actual processing (revision swap, deploying_revision cleanup, etc.) is delegated to handlers. 
### Per-Sub-Step Handlers diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index d32a3497133..9b3bcedb13f 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -36,6 +36,7 @@ ) from ai.backend.manager.data.session.types import SchedulingResult, SubStepResult from ai.backend.manager.defs import LockID +from ai.backend.manager.models.deployment_policy import BlueGreenSpec, RollingUpdateSpec from ai.backend.manager.models.endpoint import EndpointRow from ai.backend.manager.models.routing import RoutingRow from ai.backend.manager.repositories.base.creator import BulkCreator @@ -72,7 +73,9 @@ ReconcileDeploymentHandler, ScalingDeploymentHandler, ) -from .strategy.evaluator import DeploymentStrategyEvaluator +from .strategy.blue_green import BlueGreenStrategy +from .strategy.evaluator import DeploymentStrategyEvaluator, DeploymentStrategyRegistry +from .strategy.rolling_update import RollingUpdateStrategy from .strategy.types import EvaluationResult from .types import DeploymentExecutionResult, DeploymentLifecycleType @@ -155,12 +158,22 @@ def __init__( valkey_stat=valkey_stat, ) self._deployment_handlers = self._init_handlers(executor) + self._strategy_registry = self._init_deployment_strategy_registry() self._deployment_evaluators = { DeploymentLifecycleType.DEPLOYING: DeploymentStrategyEvaluator( deployment_repo=self._deployment_repository, + strategy_registry=self._strategy_registry, ), } + @staticmethod + def _init_deployment_strategy_registry() -> DeploymentStrategyRegistry: + """Initialize the strategy registry with all supported deployment strategies.""" + registry = DeploymentStrategyRegistry() + registry.register(DeploymentStrategy.ROLLING, RollingUpdateStrategy, RollingUpdateSpec) + registry.register(DeploymentStrategy.BLUE_GREEN, BlueGreenStrategy, BlueGreenSpec) + return registry + def 
_init_handlers( self, executor: DeploymentExecutor ) -> Mapping[DeploymentHandlerKey, DeploymentHandler]: diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/__init__.py b/src/ai/backend/manager/sokovan/deployment/strategy/__init__.py index a2ecf59ecb4..964ab31c132 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/__init__.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/__init__.py @@ -1 +1,11 @@ """Deployment strategy evaluation for rolling update and blue-green deployments (BEP-1049).""" + +from .blue_green import BlueGreenStrategy +from .rolling_update import RollingUpdateStrategy +from .types import BaseDeploymentStrategy + +__all__ = [ + "BaseDeploymentStrategy", + "BlueGreenStrategy", + "RollingUpdateStrategy", +] diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py index 6e76625ed8b..a282f0f8095 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/blue_green.py @@ -14,13 +14,20 @@ ) from ai.backend.manager.models.deployment_policy import BlueGreenSpec -from .types import CycleEvaluationResult +from .types import BaseDeploymentStrategy, CycleEvaluationResult -def blue_green_evaluate( - deployment: DeploymentInfo, - routes: Sequence[RouteInfo], - spec: BlueGreenSpec, -) -> CycleEvaluationResult: - """Evaluate one cycle of blue-green deployment for a single deployment.""" - raise NotImplementedError("Blue-green deployment strategy is not yet implemented") +class BlueGreenStrategy(BaseDeploymentStrategy): + """Blue-green deployment strategy FSM.""" + + def __init__(self, spec: BlueGreenSpec) -> None: + super().__init__(spec) + self._spec = spec + + def evaluate_cycle( + self, + deployment: DeploymentInfo, + routes: Sequence[RouteInfo], + ) -> CycleEvaluationResult: + """Evaluate one cycle of blue-green deployment for a single deployment.""" + raise 
NotImplementedError("Blue-green deployment strategy is not yet implemented") diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py index 9d84ced2104..ffa95df5eb0 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/evaluator.py @@ -9,6 +9,9 @@ import logging from collections.abc import Sequence +from dataclasses import dataclass + +from pydantic import BaseModel from ai.backend.common.data.model_deployment.types import DeploymentStrategy from ai.backend.logging import BraceStyleAdapter @@ -21,7 +24,6 @@ InvalidDeploymentStrategy, InvalidDeploymentStrategySpec, ) -from ai.backend.manager.models.deployment_policy import BlueGreenSpec, RollingUpdateSpec from ai.backend.manager.repositories.base import BatchQuerier, NoPagination from ai.backend.manager.repositories.deployment.options import ( DeploymentPolicyConditions, @@ -29,9 +31,34 @@ from ai.backend.manager.repositories.deployment.repository import DeploymentRepository from ai.backend.manager.sokovan.deployment.recorder import DeploymentRecorderContext -from .blue_green import blue_green_evaluate -from .rolling_update import rolling_update_evaluate -from .types import CycleEvaluationResult, EvaluationGroup, EvaluationResult, RouteChanges +from .types import BaseDeploymentStrategy, EvaluationGroup, EvaluationResult, RouteChanges + + +@dataclass(frozen=True) +class DeploymentStrategyRegistryEntry: + """Maps a deployment strategy to its implementation class and expected spec type.""" + + strategy_cls: type[BaseDeploymentStrategy] + spec_type: type[BaseModel] + + +class DeploymentStrategyRegistry: + """Registry of deployment strategy implementations.""" + + def __init__(self) -> None: + self._entries: dict[DeploymentStrategy, DeploymentStrategyRegistryEntry] = {} + + def register( + self, + strategy: DeploymentStrategy, + strategy_cls: 
type[BaseDeploymentStrategy], + spec_type: type[BaseModel], + ) -> None: + self._entries[strategy] = DeploymentStrategyRegistryEntry(strategy_cls, spec_type) + + def get(self, strategy: DeploymentStrategy) -> DeploymentStrategyRegistryEntry | None: + return self._entries.get(strategy) + log = BraceStyleAdapter(logging.getLogger(__name__)) @@ -39,8 +66,13 @@ class DeploymentStrategyEvaluator: """Evaluates DEPLOYING deployments and produces grouped results + route mutations.""" - def __init__(self, deployment_repo: DeploymentRepository) -> None: + def __init__( + self, + deployment_repo: DeploymentRepository, + strategy_registry: DeploymentStrategyRegistry, + ) -> None: self._deployment_repo = deployment_repo + self._strategy_registry = strategy_registry async def evaluate( self, @@ -82,7 +114,8 @@ async def evaluate( routes: list[RouteInfo] = list(route_map.get(deployment.id, [])) try: - cycle_result = self._evaluate_single(deployment, routes, policy.strategy, policy) + strategy = self._create_strategy(policy.strategy, policy) + cycle_result = strategy.evaluate_cycle(deployment, routes) except Exception as e: log.warning("deployment {}: evaluation error — {}", deployment.id, e) result.errors.append((deployment, str(e))) @@ -128,30 +161,23 @@ def _record_route_changes(deployment: DeploymentInfo, changes: RouteChanges) -> ): pass - def _evaluate_single( + def _create_strategy( self, - deployment: DeploymentInfo, - routes: list[RouteInfo], strategy: DeploymentStrategy, policy: DeploymentPolicyData, - ) -> CycleEvaluationResult: - """Dispatch to the appropriate strategy FSM.""" - match strategy: - case DeploymentStrategy.ROLLING: - spec = policy.strategy_spec - if not isinstance(spec, RollingUpdateSpec): - raise InvalidDeploymentStrategySpec( - extra_msg=f"Expected RollingUpdateSpec for ROLLING strategy, got {type(spec).__name__}" - ) - return rolling_update_evaluate(deployment, routes, spec) - case DeploymentStrategy.BLUE_GREEN: - spec = policy.strategy_spec - if not 
isinstance(spec, BlueGreenSpec): - raise InvalidDeploymentStrategySpec( - extra_msg=f"Expected BlueGreenSpec for BLUE_GREEN strategy, got {type(spec).__name__}" - ) - return blue_green_evaluate(deployment, routes, spec) - case _: - raise InvalidDeploymentStrategy( - extra_msg=f"Unsupported deployment strategy: {strategy}" - ) + ) -> BaseDeploymentStrategy: + """Create a strategy instance for the given deployment policy.""" + entry = self._strategy_registry.get(strategy) + if entry is None: + raise InvalidDeploymentStrategy( + extra_msg=f"Unsupported deployment strategy: {strategy}" + ) + spec = policy.strategy_spec + if not isinstance(spec, entry.spec_type): + raise InvalidDeploymentStrategySpec( + extra_msg=( + f"Expected {entry.spec_type.__name__} for {strategy.name} strategy," + f" got {type(spec).__name__}" + ), + ) + return entry.strategy_cls(spec) diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py b/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py index fbcb764355c..923254ab388 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/rolling_update.py @@ -14,13 +14,20 @@ ) from ai.backend.manager.models.deployment_policy import RollingUpdateSpec -from .types import CycleEvaluationResult +from .types import BaseDeploymentStrategy, CycleEvaluationResult -def rolling_update_evaluate( - deployment: DeploymentInfo, - routes: Sequence[RouteInfo], - spec: RollingUpdateSpec, -) -> CycleEvaluationResult: - """Evaluate one cycle of rolling update for a single deployment.""" - raise NotImplementedError("Rolling update strategy is not yet implemented") +class RollingUpdateStrategy(BaseDeploymentStrategy): + """Rolling update deployment strategy FSM.""" + + def __init__(self, spec: RollingUpdateSpec) -> None: + super().__init__(spec) + self._spec = spec + + def evaluate_cycle( + self, + deployment: DeploymentInfo, + routes: 
Sequence[RouteInfo], + ) -> CycleEvaluationResult: + """Evaluate one cycle of rolling update for a single deployment.""" + raise NotImplementedError("Rolling update strategy is not yet implemented") diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/types.py b/src/ai/backend/manager/sokovan/deployment/strategy/types.py index 615d6e8238f..48367c58843 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/types.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/types.py @@ -2,13 +2,18 @@ from __future__ import annotations +from abc import ABC, abstractmethod +from collections.abc import Sequence from dataclasses import dataclass, field from uuid import UUID +from pydantic import BaseModel + from ai.backend.common.data.model_deployment.types import DeploymentStrategy from ai.backend.manager.data.deployment.types import ( DeploymentInfo, DeploymentSubStep, + RouteInfo, ) from ai.backend.manager.models.routing import RoutingRow from ai.backend.manager.repositories.base import Creator @@ -67,3 +72,21 @@ class EvaluationResult: # Aggregated route mutations from all per-deployment evaluations. # The coordinator applies these after evaluation completes. route_changes: RouteChanges = field(default_factory=RouteChanges) + + +class BaseDeploymentStrategy(ABC): + """Base interface for deployment strategy cycle evaluation. + + Each concrete strategy (Blue-Green, Rolling Update) implements this interface. + The spec is injected via ``__init__`` — one instance per deployment. + """ + + def __init__(self, spec: BaseModel) -> None: + self._spec = spec + + @abstractmethod + def evaluate_cycle( + self, + deployment: DeploymentInfo, + routes: Sequence[RouteInfo], + ) -> CycleEvaluationResult: ... 
From 18a853ddb8dde1cebb983282cf441f110ceb156d Mon Sep 17 00:00:00 2001 From: jopemachine Date: Wed, 4 Mar 2026 06:30:14 +0000 Subject: [PATCH 20/23] docs: Update BEP --- proposals/BEP-1049-deployment-strategy-handler.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proposals/BEP-1049-deployment-strategy-handler.md b/proposals/BEP-1049-deployment-strategy-handler.md index 604af7bf369..66df9129456 100644 --- a/proposals/BEP-1049-deployment-strategy-handler.md +++ b/proposals/BEP-1049-deployment-strategy-handler.md @@ -218,7 +218,7 @@ DeploymentStrategyEvaluator.evaluate(deployments) #### Key Design Principles 1. **Route changes are aggregated by the evaluator, applied by the coordinator**: The evaluator collects route mutations (rollout/drain/promote) from each strategy FSM into `EvaluationResult.route_changes`. The coordinator's `_apply_route_changes()` applies them after evaluation. Individual handlers do not touch routes. -2. **Strategy FSMs implement a common interface via registry**: All strategy implementations extend the `BaseDeploymentStrategy` abstract base class and implement `evaluate_cycle()`. Concrete classes (`RollingUpdateStrategy`, `BlueGreenStrategy`) live in dedicated module files (`strategy/rolling_update.py`, `strategy/blue_green.py`). The coordinator owns a `StrategyRegistry` that maps each `DeploymentStrategy` enum to its implementation class and expected spec type. The registry is injected into the evaluator, which uses it to instantiate the appropriate strategy per deployment. +2. **Strategy FSMs implement a common interface via registry**: All strategy implementations extend the `BaseDeploymentStrategy` abstract base class and implement `evaluate_cycle()`. Concrete classes (`RollingUpdateStrategy`, `BlueGreenStrategy`) live in dedicated module files (`strategy/rolling_update.py`, `strategy/blue_green.py`). 
The coordinator owns a `DeploymentStrategyRegistry` that maps each `DeploymentStrategy` enum to its implementation class and expected spec type. The registry is injected into the evaluator, which uses it to instantiate the appropriate strategy per deployment. 3. **Only grouping is returned**: The evaluator classifies deployments by sub-step; actual processing (revision swap, deploying_revision cleanup, etc.) is delegated to handlers. ### Per-Sub-Step Handlers From a8efb87aa55784848b2bb22f515b4e1b0365c19b Mon Sep 17 00:00:00 2001 From: jopemachine Date: Wed, 4 Mar 2026 07:34:05 +0000 Subject: [PATCH 21/23] fix: guard `complete_deployment_revision_swap` --- .../manager/repositories/deployment/db_source/db_source.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py index 38aa75d45f2..4864d72f048 100644 --- a/src/ai/backend/manager/repositories/deployment/db_source/db_source.py +++ b/src/ai/backend/manager/repositories/deployment/db_source/db_source.py @@ -2271,7 +2271,10 @@ async def complete_deployment_revision_swap( async with self._begin_session_read_committed() as db_sess: stmt = ( sa.update(EndpointRow) - .where(EndpointRow.id.in_(endpoint_ids)) + .where( + EndpointRow.id.in_(endpoint_ids), + EndpointRow.deploying_revision.isnot(None), + ) .values( current_revision=EndpointRow.deploying_revision, deploying_revision=None, From 9c24d1df33aacbc12957cd098fc58e31ef39bb9f Mon Sep 17 00:00:00 2001 From: jopemachine Date: Wed, 4 Mar 2026 09:49:03 +0000 Subject: [PATCH 22/23] refactoring wip --- .../manager/sokovan/deployment/coordinator.py | 380 ++++-------------- .../sokovan/deployment/handlers/__init__.py | 4 + .../sokovan/deployment/handlers/base.py | 35 +- .../sokovan/deployment/handlers/deploying.py | 282 ++++++++++++- .../sokovan/deployment/strategy/types.py | 14 +- 
.../deployment/test_coordinator_history.py | 6 + 6 files changed, 407 insertions(+), 314 deletions(-) diff --git a/src/ai/backend/manager/sokovan/deployment/coordinator.py b/src/ai/backend/manager/sokovan/deployment/coordinator.py index 9b3bcedb13f..9781f084ce3 100644 --- a/src/ai/backend/manager/sokovan/deployment/coordinator.py +++ b/src/ai/backend/manager/sokovan/deployment/coordinator.py @@ -5,7 +5,7 @@ from __future__ import annotations import logging -from collections.abc import Mapping +from collections.abc import Mapping, Sequence from contextlib import AsyncExitStack from dataclasses import dataclass from datetime import UTC, datetime @@ -14,10 +14,7 @@ from ai.backend.common.clients.http_client.client_pool import ClientPool from ai.backend.common.clients.valkey_client.valkey_schedule import ValkeyScheduleClient from ai.backend.common.clients.valkey_client.valkey_stat.client import ValkeyStatClient -from ai.backend.common.data.endpoint.types import EndpointLifecycle from ai.backend.common.data.model_deployment.types import DeploymentStrategy -from ai.backend.common.data.notification import NotificationRuleType -from ai.backend.common.data.notification.messages import EndpointLifecycleChangedMessage from ai.backend.common.events.dispatcher import EventProducer from ai.backend.common.events.event_types.notification import NotificationTriggeredEvent from ai.backend.common.events.event_types.schedule.anycast import ( @@ -31,14 +28,10 @@ DeploymentInfo, DeploymentSubStatus, DeploymentSubStep, - RouteStatus, - RouteTrafficStatus, ) from ai.backend.manager.data.session.types import SchedulingResult, SubStepResult -from ai.backend.manager.defs import LockID from ai.backend.manager.models.deployment_policy import BlueGreenSpec, RollingUpdateSpec from ai.backend.manager.models.endpoint import EndpointRow -from ai.backend.manager.models.routing import RoutingRow from ai.backend.manager.repositories.base.creator import BulkCreator from 
ai.backend.manager.repositories.base.updater import BatchUpdater from ai.backend.manager.repositories.deployment import ( @@ -47,12 +40,11 @@ ) from ai.backend.manager.repositories.deployment.creators import ( EndpointLifecycleBatchUpdaterSpec, - RouteBatchUpdaterSpec, ) -from ai.backend.manager.repositories.deployment.options import RouteConditions from ai.backend.manager.repositories.scheduling_history.creators import DeploymentHistoryCreatorSpec from ai.backend.manager.sokovan.deployment.recorder import DeploymentRecorderContext from ai.backend.manager.sokovan.deployment.route.route_controller import RouteController +from ai.backend.manager.sokovan.recorder.pool import RecordPool from ai.backend.manager.sokovan.recorder.types import ExecutionRecord from ai.backend.manager.sokovan.recorder.utils import extract_sub_steps_for_entity from ai.backend.manager.sokovan.scheduling_controller.scheduling_controller import ( @@ -65,6 +57,7 @@ from .handlers import ( CheckPendingDeploymentHandler, CheckReplicaDeploymentHandler, + DeployingHandler, DeployingProgressingHandler, DeployingProvisioningHandler, DeployingRolledBackHandler, @@ -72,18 +65,13 @@ DestroyingDeploymentHandler, ReconcileDeploymentHandler, ScalingDeploymentHandler, + build_lifecycle_notification_event, ) from .strategy.blue_green import BlueGreenStrategy from .strategy.evaluator import DeploymentStrategyEvaluator, DeploymentStrategyRegistry from .strategy.rolling_update import RollingUpdateStrategy -from .strategy.types import EvaluationResult from .types import DeploymentExecutionResult, DeploymentLifecycleType -# Handler key: either a simple lifecycle type or a (lifecycle, sub-step) tuple -type DeploymentHandlerKey = ( - DeploymentLifecycleType | tuple[DeploymentLifecycleType, DeploymentSubStep] -) - log = BraceStyleAdapter(logging.getLogger(__name__)) @@ -121,8 +109,7 @@ class DeploymentCoordinator: _valkey_schedule: ValkeyScheduleClient _deployment_controller: DeploymentController 
_deployment_repository: DeploymentRepository - _deployment_handlers: Mapping[DeploymentHandlerKey, DeploymentHandler] - _deployment_evaluators: Mapping[DeploymentLifecycleType, DeploymentStrategyEvaluator] + _deployment_handlers: Mapping[DeploymentLifecycleType, DeploymentHandler] _lock_factory: DistributedLockFactory _config_provider: ManagerConfigProvider _event_producer: EventProducer @@ -158,13 +145,6 @@ def __init__( valkey_stat=valkey_stat, ) self._deployment_handlers = self._init_handlers(executor) - self._strategy_registry = self._init_deployment_strategy_registry() - self._deployment_evaluators = { - DeploymentLifecycleType.DEPLOYING: DeploymentStrategyEvaluator( - deployment_repo=self._deployment_repository, - strategy_registry=self._strategy_registry, - ), - } @staticmethod def _init_deployment_strategy_registry() -> DeploymentStrategyRegistry: @@ -176,9 +156,31 @@ def _init_deployment_strategy_registry() -> DeploymentStrategyRegistry: def _init_handlers( self, executor: DeploymentExecutor - ) -> Mapping[DeploymentHandlerKey, DeploymentHandler]: - """Initialize and return the mapping of handler keys to their handlers.""" - handlers: dict[DeploymentHandlerKey, DeploymentHandler] = { + ) -> Mapping[DeploymentLifecycleType, DeploymentHandler]: + """Initialize and return the mapping of lifecycle types to their handlers.""" + # Strategy registry + evaluator for DEPLOYING composite handler + strategy_registry = self._init_deployment_strategy_registry() + evaluator = DeploymentStrategyEvaluator( + deployment_repo=self._deployment_repository, + strategy_registry=strategy_registry, + ) + + # Sub-step handlers used internally by DeployingHandler + sub_step_handlers: Mapping[DeploymentSubStep, DeploymentHandler] = { + DeploymentSubStep.PROVISIONING: DeployingProvisioningHandler( + deployment_controller=self._deployment_controller, + route_controller=self._route_controller, + ), + DeploymentSubStep.PROGRESSING: DeployingProgressingHandler( + 
deployment_controller=self._deployment_controller, + route_controller=self._route_controller, + ), + DeploymentSubStep.ROLLED_BACK: DeployingRolledBackHandler( + deployment_repo=self._deployment_repository, + ), + } + + handlers: dict[DeploymentLifecycleType, DeploymentHandler] = { DeploymentLifecycleType.CHECK_PENDING: CheckPendingDeploymentHandler( deployment_executor=executor, deployment_controller=self._deployment_controller, @@ -196,29 +198,17 @@ def _init_handlers( deployment_executor=executor, deployment_controller=self._deployment_controller, ), + DeploymentLifecycleType.DEPLOYING: DeployingHandler( + evaluator=evaluator, + sub_step_handlers=sub_step_handlers, + deployment_repo=self._deployment_repository, + event_producer=self._event_producer, + ), DeploymentLifecycleType.DESTROYING: DestroyingDeploymentHandler( deployment_executor=executor, deployment_controller=self._deployment_controller, route_controller=self._route_controller, ), - # DEPLOYING sub-step handlers (keyed by composite key) - (DeploymentLifecycleType.DEPLOYING, DeploymentSubStep.PROVISIONING): ( - DeployingProvisioningHandler( - deployment_controller=self._deployment_controller, - route_controller=self._route_controller, - ) - ), - (DeploymentLifecycleType.DEPLOYING, DeploymentSubStep.PROGRESSING): ( - DeployingProgressingHandler( - deployment_controller=self._deployment_controller, - route_controller=self._route_controller, - ) - ), - (DeploymentLifecycleType.DEPLOYING, DeploymentSubStep.ROLLED_BACK): ( - DeployingRolledBackHandler( - deployment_repo=self._deployment_repository, - ) - ), } return handlers @@ -226,12 +216,6 @@ async def process_deployment_lifecycle( self, lifecycle_type: DeploymentLifecycleType, ) -> None: - # Check if this lifecycle type uses an evaluator (e.g. 
DEPLOYING) - evaluator = self._deployment_evaluators.get(lifecycle_type) - if evaluator is not None: - await self._process_with_evaluator(lifecycle_type, evaluator) - return - handler = self._deployment_handlers.get(lifecycle_type) if not handler: log.warning("No handler for deployment lifecycle type: {}", lifecycle_type.value) @@ -248,21 +232,17 @@ async def process_deployment_lifecycle( return log.info("handler: {} - processing {} deployments", handler.name(), len(deployments)) - # Execute handler with recorder context deployment_ids = [d.id for d in deployments] with DeploymentRecorderContext.scope( lifecycle_type.value, entity_ids=deployment_ids ) as pool: - result = await handler.execute(deployments) - all_records = pool.build_all_records() - - # Handle status transitions with history recording - await self._handle_status_transitions(handler, result, all_records) + handler_tasks = await handler.prepare(deployments) + handler_results, all_records = await self._execute_and_transition_handlers( + handler_tasks, pool + ) - try: - await handler.post_process(result) - except Exception as e: - log.error("Error during post-processing: {}", e) + await handler.finalize(all_records) + await self._post_process_handlers(handler_results) async def _handle_status_transitions( self, @@ -322,7 +302,7 @@ async def _handle_status_transitions( ) all_history_specs.extend(success_history_specs) notification_events.extend([ - self._build_lifecycle_notification_event( + build_lifecycle_notification_event( deployment=d, from_status=from_status, to_status=next_lifecycle, @@ -364,7 +344,7 @@ async def _handle_status_transitions( ) all_history_specs.extend(failure_history_specs) notification_events.extend([ - self._build_lifecycle_notification_event( + build_lifecycle_notification_event( deployment=e.deployment_info, from_status=from_status, to_status=failure_lifecycle, @@ -387,219 +367,48 @@ async def _handle_status_transitions( except Exception as e: log.warning("Failed to send 
lifecycle notification: {}", e) - async def _process_with_evaluator( - self, - lifecycle_type: DeploymentLifecycleType, - evaluator: DeploymentStrategyEvaluator, - ) -> None: - """Process deployments that use a strategy evaluator (e.g. DEPLOYING). - - 1. Acquire distributed lock. - 2. Load DEPLOYING deployments. - 3. Run evaluator (evaluates strategy FSM, aggregates route mutations). - 4. Apply aggregated route mutations. - 5. For each sub-step group, run the corresponding handler. - 6. Handle errors and skipped deployments. - 7. For completed deployments, swap revisions and transition to READY. - """ - lock_lifetime = self._config_provider.config.manager.session_schedule_lock_lifetime - async with self._lock_factory(LockID.LOCKID_DEPLOYMENT_DEPLOYING, lock_lifetime): - deployments = await self._deployment_repository.get_endpoints_by_statuses([ - EndpointLifecycle.DEPLOYING - ]) - if not deployments: - log.trace("No DEPLOYING deployments to process") - return - log.info("DEPLOYING: processing {} deployments", len(deployments)) - - deployment_ids = [d.id for d in deployments] - sub_results: dict[DeploymentSubStep, DeploymentExecutionResult] = {} - with DeploymentRecorderContext.scope( - lifecycle_type.value, entity_ids=deployment_ids - ) as pool: - eval_result = await evaluator.evaluate(deployments) - - # Apply aggregated route mutations from the evaluation - await self._apply_route_changes(eval_result) - - all_records = pool.build_all_records() - - # Process each sub-step group with its handler - for sub_step, group in eval_result.groups.items(): - handler_key: DeploymentHandlerKey = (lifecycle_type, sub_step) - handler = self._deployment_handlers.get(handler_key) - if handler is None: - log.warning( - "No handler for sub-step {}/{}", lifecycle_type.value, sub_step.value - ) - continue - - sub_result = await handler.execute(group.deployments) - sub_results[sub_step] = sub_result - await self._handle_status_transitions(handler, sub_result, all_records) - - # Handle 
evaluation errors (Finding 3) — record history, keep DEPLOYING - if eval_result.errors: - error_history_specs = [ - DeploymentHistoryCreatorSpec( - deployment_id=deployment.id, - phase=lifecycle_type.value, - result=SchedulingResult.NEED_RETRY, - message=f"Evaluation error: {reason}", - from_status=EndpointLifecycle.DEPLOYING, - to_status=None, - sub_steps=[], - ) - for deployment, reason in eval_result.errors - ] - await self._deployment_repository.update_endpoint_lifecycle_bulk_with_history( - [], BulkCreator(specs=error_history_specs) - ) - for deployment, reason in eval_result.errors: - log.error("Deployment {} evaluation error: {}", deployment.id, reason) - - # Handle skipped deployments (Finding 5) — record history, keep DEPLOYING - if eval_result.skipped: - skipped_history_specs = [ - DeploymentHistoryCreatorSpec( - deployment_id=deployment.id, - phase=lifecycle_type.value, - result=SchedulingResult.SKIPPED, - message="No deployment policy found", - from_status=EndpointLifecycle.DEPLOYING, - to_status=None, - sub_steps=[], - ) - for deployment in eval_result.skipped - ] - await self._deployment_repository.update_endpoint_lifecycle_bulk_with_history( - [], BulkCreator(specs=skipped_history_specs) - ) - for deployment in eval_result.skipped: - log.warning("Deployment {} skipped: no deployment policy found", deployment.id) - - # Post-process outside recorder scope using actual sub_results (Finding 4) - for sub_step, group in eval_result.groups.items(): - handler_key = (lifecycle_type, sub_step) - handler = self._deployment_handlers.get(handler_key) - if handler is None: - continue - try: - actual_result = sub_results.get( - sub_step, - DeploymentExecutionResult(successes=group.deployments), - ) - await handler.post_process(actual_result) - except Exception as e: - log.error( - "Error during post-processing for sub-step {}: {}", - sub_step.value, - e, - ) - - # Transition completed deployments: swap revision and move to READY - if eval_result.completed: - await 
self._transition_completed_deployments( - lifecycle_type, - eval_result.completed, - strategies=eval_result.completed_strategies, - records=all_records, - ) - - async def _apply_route_changes( + async def _execute_and_transition_handlers( self, - eval_result: EvaluationResult, - ) -> None: - """Apply aggregated route mutations from the evaluation result.""" - changes = eval_result.route_changes - if not changes.rollout_specs and not changes.drain_route_ids: - return + handler_tasks: Sequence[tuple[DeploymentHandler, Sequence[DeploymentInfo]]], + pool: RecordPool[UUID], + ) -> tuple[ + list[tuple[DeploymentHandler, DeploymentExecutionResult]], + Mapping[UUID, ExecutionRecord], + ]: + """Execute handlers, build records, and handle status transitions. - scale_in_updater: BatchUpdater[RoutingRow] | None = None - if changes.drain_route_ids: - scale_in_updater = BatchUpdater( - spec=RouteBatchUpdaterSpec( - status=RouteStatus.TERMINATING, - traffic_ratio=0.0, - traffic_status=RouteTrafficStatus.INACTIVE, - ), - conditions=[RouteConditions.by_ids(changes.drain_route_ids)], - ) - - await self._deployment_repository.scale_routes(changes.rollout_specs, scale_in_updater) - log.debug( - "Applied route changes: {} created, {} terminated", - len(changes.rollout_specs), - len(changes.drain_route_ids), - ) + Must be called within a recorder scope. Records are built after all + handlers have executed to capture all execution records. - async def _transition_completed_deployments( - self, - lifecycle_type: DeploymentLifecycleType, - completed: list[DeploymentInfo], - strategies: dict[UUID, DeploymentStrategy], - records: Mapping[UUID, ExecutionRecord], - ) -> None: - """Transition completed DEPLOYING deployments to READY. + Args: + handler_tasks: Sequence of (handler, deployments) pairs to execute. + pool: The recorder pool for building execution records. - Atomically: - 1. Swap deploying_revision → current_revision (with idempotency guard). - 2. 
Update lifecycle to READY with history recording. - 3. Send notification events. + Returns: + Tuple of (handler results, execution records). """ - endpoint_ids = {deployment.id for deployment in completed} - - # Build lifecycle transition - target_statuses = [EndpointLifecycle.DEPLOYING] - from_status = EndpointLifecycle.DEPLOYING - to_status = EndpointLifecycle.READY - - batch_updater = BatchUpdater( - spec=EndpointLifecycleBatchUpdaterSpec(lifecycle_stage=to_status), - conditions=[ - DeploymentConditions.by_ids(list(endpoint_ids)), - DeploymentConditions.by_lifecycle_stages(target_statuses), - ], - ) + handler_results: list[tuple[DeploymentHandler, DeploymentExecutionResult]] = [] + for handler, handler_deployments in handler_tasks: + result = await handler.execute(handler_deployments) + handler_results.append((handler, result)) - timestamp_now = datetime.now(UTC).isoformat() - history_specs = [ - DeploymentHistoryCreatorSpec( - deployment_id=deployment.id, - phase=lifecycle_type.value, - result=SchedulingResult.SUCCESS, - message=f"Deployment completed successfully (strategy: {strategies[deployment.id].value})" - if deployment.id in strategies - else "Deployment completed successfully", - from_status=from_status, - to_status=to_status, - sub_steps=extract_sub_steps_for_entity(deployment.id, records), - ) - for deployment in completed - ] + all_records = pool.build_all_records() - # Atomic: revision swap + lifecycle update + history recording - await self._deployment_repository.complete_deployment_and_transition_to_ready( - endpoint_ids, [batch_updater], BulkCreator(specs=history_specs) - ) - log.info( - "Atomically swapped revision and transitioned {} deployments to READY", - len(endpoint_ids), - ) + for handler, result in handler_results: + await self._handle_status_transitions(handler, result, all_records) + + return handler_results, all_records - # Send notifications - for deployment in completed: + async def _post_process_handlers( + self, + handler_results: 
Sequence[tuple[DeploymentHandler, DeploymentExecutionResult]], + ) -> None: + """Run post-processing for handlers outside the recorder scope.""" + for handler, result in handler_results: try: - event = self._build_lifecycle_notification_event( - deployment=deployment, - from_status=from_status, - to_status=to_status, - transition_result="success", - timestamp=timestamp_now, - ) - await self._event_producer.anycast_event(event) + await handler.post_process(result) except Exception as e: - log.warning("Failed to send lifecycle notification: {}", e) + log.error("Error during post-processing for {}: {}", handler.name(), e) @staticmethod def _build_history_sub_steps( @@ -622,43 +431,8 @@ def _build_history_sub_steps( ) return sub_steps - def _build_lifecycle_notification_event( - self, - deployment: DeploymentInfo, - from_status: EndpointLifecycle | None, - to_status: EndpointLifecycle, - transition_result: str, - timestamp: str, - ) -> NotificationTriggeredEvent: - """Build a notification event for a lifecycle transition.""" - message = EndpointLifecycleChangedMessage( - endpoint_id=str(deployment.id), - endpoint_name=deployment.metadata.name, - domain=deployment.metadata.domain, - project_id=str(deployment.metadata.project), - resource_group=deployment.metadata.resource_group, - from_status=from_status.value if from_status else None, - to_status=to_status.value, - transition_result=transition_result, - event_timestamp=timestamp, - ) - return NotificationTriggeredEvent( - rule_type=NotificationRuleType.ENDPOINT_LIFECYCLE_CHANGED.value, - timestamp=datetime.now(UTC), - notification_data=message.model_dump(), - ) - async def process_if_needed(self, lifecycle_type: DeploymentLifecycleType) -> None: - """ - Process deployment lifecycle operation if needed (based on internal state). 
- - Args: - lifecycle_type: Type of deployment lifecycle operation - - Returns: - True if operation was performed, False otherwise - """ - # Check internal state (uses Redis marks) + """Process deployment lifecycle operation if needed (based on internal state).""" if not await self._valkey_schedule.load_and_delete_deployment_mark(lifecycle_type.value): return await self.process_deployment_lifecycle(lifecycle_type) diff --git a/src/ai/backend/manager/sokovan/deployment/handlers/__init__.py b/src/ai/backend/manager/sokovan/deployment/handlers/__init__.py index a5c94ed0ae4..a38922ad64a 100644 --- a/src/ai/backend/manager/sokovan/deployment/handlers/__init__.py +++ b/src/ai/backend/manager/sokovan/deployment/handlers/__init__.py @@ -4,10 +4,12 @@ from .base import DeploymentHandler from .deploying import ( + DeployingHandler, DeployingInProgressHandler, DeployingProgressingHandler, DeployingProvisioningHandler, DeployingRolledBackHandler, + build_lifecycle_notification_event, ) from .destroying import DestroyingDeploymentHandler from .pending import CheckPendingDeploymentHandler @@ -18,6 +20,7 @@ __all__ = [ "CheckPendingDeploymentHandler", "CheckReplicaDeploymentHandler", + "DeployingHandler", "DeployingInProgressHandler", "DeployingProgressingHandler", "DeployingProvisioningHandler", @@ -26,4 +29,5 @@ "DestroyingDeploymentHandler", "ReconcileDeploymentHandler", "ScalingDeploymentHandler", + "build_lifecycle_notification_event", ] diff --git a/src/ai/backend/manager/sokovan/deployment/handlers/base.py b/src/ai/backend/manager/sokovan/deployment/handlers/base.py index d3a40cb31dd..2cf2f16bf89 100644 --- a/src/ai/backend/manager/sokovan/deployment/handlers/base.py +++ b/src/ai/backend/manager/sokovan/deployment/handlers/base.py @@ -1,5 +1,8 @@ +from __future__ import annotations + from abc import abstractmethod -from collections.abc import Sequence +from collections.abc import Mapping, Sequence +from uuid import UUID from ai.backend.manager.data.deployment.types import 
( DeploymentInfo, @@ -8,6 +11,7 @@ from ai.backend.manager.data.model_serving.types import EndpointLifecycle from ai.backend.manager.defs import LockID from ai.backend.manager.sokovan.deployment.types import DeploymentExecutionResult +from ai.backend.manager.sokovan.recorder.types import ExecutionRecord class DeploymentHandler: @@ -53,6 +57,16 @@ def status_transitions(cls) -> DeploymentStatusTransitions: """ raise NotImplementedError("Subclasses must implement status_transitions()") + async def prepare( + self, deployments: Sequence[DeploymentInfo] + ) -> list[tuple[DeploymentHandler, Sequence[DeploymentInfo]]]: + """Prepare handler tasks for execution. + + Default: treat self as a single sub-step. + Override for composite handlers (e.g., DeployingHandler) that dispatch to sub-handlers. + """ + return [(self, deployments)] + @abstractmethod async def execute(self, deployments: Sequence[DeploymentInfo]) -> DeploymentExecutionResult: """Execute the scheduling operation. @@ -64,9 +78,24 @@ async def execute(self, deployments: Sequence[DeploymentInfo]) -> DeploymentExec @abstractmethod async def post_process(self, result: DeploymentExecutionResult) -> None: - """Handle post-processing after the operation. + """Per-handler post-processing after execute(). + + Called for each (handler, result) pair returned by prepare(). + For composite handlers, this means each sub-step handler's post_process + is called individually — not the composite handler itself. + + Typical use: reschedule the next lifecycle cycle, trigger dependent lifecycles. Args: - result: The result from execute() + result: The result from this handler's execute() """ raise NotImplementedError("Subclasses must implement post_process()") + + async def finalize(self, records: Mapping[UUID, ExecutionRecord]) -> None: + """Post-execution finalization with access to execution records. + + Called after all handler tasks have been executed and status transitions recorded, + but before post_process. Default: no-op. 
+ Override for composite handlers that need atomic completion transitions. + """ + pass diff --git a/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py b/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py index 95d07c94a32..c3f1be8562b 100644 --- a/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py +++ b/src/ai/backend/manager/sokovan/deployment/handlers/deploying.py @@ -7,31 +7,58 @@ The rolled-back handler clears ``deploying_revision`` and transitions the deployment back to READY. + +The composite ``DeployingHandler`` encapsulates strategy evaluation, route +mutations, sub-step dispatch, and completed deployment transitions so that +the coordinator can treat DEPLOYING identically to every other lifecycle type. """ from __future__ import annotations import logging -from collections.abc import Sequence +from collections.abc import Mapping, Sequence +from datetime import UTC, datetime from typing import override +from uuid import UUID +from ai.backend.common.data.notification import NotificationRuleType +from ai.backend.common.data.notification.messages import EndpointLifecycleChangedMessage +from ai.backend.common.events.dispatcher import EventProducer +from ai.backend.common.events.event_types.notification import NotificationTriggeredEvent from ai.backend.logging import BraceStyleAdapter from ai.backend.manager.data.deployment.types import ( DeploymentInfo, DeploymentLifecycleStatus, DeploymentStatusTransitions, DeploymentSubStep, + RouteStatus, + RouteTrafficStatus, ) from ai.backend.manager.data.model_serving.types import EndpointLifecycle +from ai.backend.manager.data.session.types import SchedulingResult from ai.backend.manager.defs import LockID +from ai.backend.manager.models.routing import RoutingRow +from ai.backend.manager.repositories.base.creator import BulkCreator +from ai.backend.manager.repositories.base.updater import BatchUpdater +from ai.backend.manager.repositories.deployment import DeploymentConditions 
+from ai.backend.manager.repositories.deployment.creators import ( + EndpointLifecycleBatchUpdaterSpec, + RouteBatchUpdaterSpec, +) +from ai.backend.manager.repositories.deployment.options import RouteConditions from ai.backend.manager.repositories.deployment.repository import DeploymentRepository +from ai.backend.manager.repositories.scheduling_history.creators import DeploymentHistoryCreatorSpec from ai.backend.manager.sokovan.deployment.deployment_controller import DeploymentController from ai.backend.manager.sokovan.deployment.route.route_controller import RouteController from ai.backend.manager.sokovan.deployment.route.types import RouteLifecycleType +from ai.backend.manager.sokovan.deployment.strategy.evaluator import DeploymentStrategyEvaluator +from ai.backend.manager.sokovan.deployment.strategy.types import EvaluationGroup, EvaluationResult from ai.backend.manager.sokovan.deployment.types import ( DeploymentExecutionResult, DeploymentLifecycleType, ) +from ai.backend.manager.sokovan.recorder.types import ExecutionRecord +from ai.backend.manager.sokovan.recorder.utils import extract_sub_steps_for_entity from .base import DeploymentHandler @@ -187,3 +214,256 @@ async def execute(self, deployments: Sequence[DeploymentInfo]) -> DeploymentExec @override async def post_process(self, result: DeploymentExecutionResult) -> None: pass + + +# --------------------------------------------------------------------------- +# Composite handler +# --------------------------------------------------------------------------- + + +def build_lifecycle_notification_event( + deployment: DeploymentInfo, + from_status: EndpointLifecycle | None, + to_status: EndpointLifecycle, + transition_result: str, + timestamp: str, +) -> NotificationTriggeredEvent: + """Build a notification event for a lifecycle transition.""" + message = EndpointLifecycleChangedMessage( + endpoint_id=str(deployment.id), + endpoint_name=deployment.metadata.name, + domain=deployment.metadata.domain, + 
project_id=str(deployment.metadata.project), + resource_group=deployment.metadata.resource_group, + from_status=from_status.value if from_status else None, + to_status=to_status.value, + transition_result=transition_result, + event_timestamp=timestamp, + ) + return NotificationTriggeredEvent( + rule_type=NotificationRuleType.ENDPOINT_LIFECYCLE_CHANGED.value, + timestamp=datetime.now(UTC), + notification_data=message.model_dump(), + ) + + +class DeployingHandler(DeploymentHandler): + """Composite handler for DEPLOYING lifecycle. + + Encapsulates strategy evaluation, route mutations, sub-step dispatch, + and completed deployment transitions so the coordinator treats DEPLOYING + identically to every other lifecycle type. + """ + + def __init__( + self, + evaluator: DeploymentStrategyEvaluator, + sub_step_handlers: Mapping[DeploymentSubStep, DeploymentHandler], + deployment_repo: DeploymentRepository, + event_producer: EventProducer, + ) -> None: + self._evaluator = evaluator + self._sub_step_handlers = sub_step_handlers + self._deployment_repo = deployment_repo + self._event_producer = event_producer + self._eval_result: EvaluationResult | None = None + + @classmethod + @override + def name(cls) -> str: + return "deploying" + + @property + @override + def lock_id(self) -> LockID | None: + return LockID.LOCKID_DEPLOYMENT_DEPLOYING + + @classmethod + @override + def target_statuses(cls) -> list[EndpointLifecycle]: + return [EndpointLifecycle.DEPLOYING] + + @classmethod + @override + def status_transitions(cls) -> DeploymentStatusTransitions: + return DeploymentStatusTransitions(success=None, failure=None) + + @override + async def prepare( + self, deployments: Sequence[DeploymentInfo] + ) -> list[tuple[DeploymentHandler, Sequence[DeploymentInfo]]]: + """Run evaluator, apply route changes, return sub-step handler tasks.""" + eval_result = await self._evaluator.evaluate(deployments) + self._eval_result = eval_result + await self._apply_route_changes(eval_result) + return 
self._resolve_handler_tasks(eval_result.groups) + + @override + async def execute(self, deployments: Sequence[DeploymentInfo]) -> DeploymentExecutionResult: + # Not called directly; prepare() returns sub-step handlers + return DeploymentExecutionResult(successes=list(deployments)) + + @override + async def post_process(self, result: DeploymentExecutionResult) -> None: + # Not called directly; sub-step handlers handle post-processing + pass + + @override + async def finalize(self, records: Mapping[UUID, ExecutionRecord]) -> None: + """Record evaluation outcomes and transition completed deployments.""" + eval_result = self._eval_result + if eval_result is None: + return + await self._record_evaluation_outcomes(eval_result) + if eval_result.completed: + await self._transition_completed_deployments(eval_result, records) + self._eval_result = None + + # -- Private helpers (moved from coordinator) -- + + async def _apply_route_changes(self, eval_result: EvaluationResult) -> None: + """Apply aggregated route mutations from the evaluation result.""" + changes = eval_result.route_changes + if not changes.rollout_specs and not changes.drain_route_ids: + return + + scale_in_updater: BatchUpdater[RoutingRow] | None = None + if changes.drain_route_ids: + scale_in_updater = BatchUpdater( + spec=RouteBatchUpdaterSpec( + status=RouteStatus.TERMINATING, + traffic_ratio=0.0, + traffic_status=RouteTrafficStatus.INACTIVE, + ), + conditions=[RouteConditions.by_ids(changes.drain_route_ids)], + ) + + await self._deployment_repo.scale_routes(changes.rollout_specs, scale_in_updater) + log.debug( + "Applied route changes: {} created, {} terminated", + len(changes.rollout_specs), + len(changes.drain_route_ids), + ) + + def _resolve_handler_tasks( + self, groups: dict[DeploymentSubStep, EvaluationGroup] + ) -> list[tuple[DeploymentHandler, Sequence[DeploymentInfo]]]: + """Resolve sub-step groups into handler-deployment pairs.""" + tasks: list[tuple[DeploymentHandler, 
Sequence[DeploymentInfo]]] = [] + for sub_step, group in groups.items(): + handler = self._sub_step_handlers.get(sub_step) + if handler is None: + log.warning("No handler for DEPLOYING sub-step {}", sub_step.value) + continue + tasks.append((handler, group.deployments)) + return tasks + + async def _record_evaluation_outcomes(self, eval_result: EvaluationResult) -> None: + """Record history for evaluation errors and skipped deployments.""" + lifecycle_value = DeploymentLifecycleType.DEPLOYING.value + + if eval_result.errors: + error_history_specs = [ + DeploymentHistoryCreatorSpec( + deployment_id=deployment.id, + phase=lifecycle_value, + result=SchedulingResult.NEED_RETRY, + message=f"Evaluation error: {reason}", + from_status=EndpointLifecycle.DEPLOYING, + to_status=None, + sub_steps=[], + ) + for deployment, reason in eval_result.errors + ] + await self._deployment_repo.update_endpoint_lifecycle_bulk_with_history( + [], BulkCreator(specs=error_history_specs) + ) + for deployment, reason in eval_result.errors: + log.error("Deployment {} evaluation error: {}", deployment.id, reason) + + if eval_result.skipped: + skipped_history_specs = [ + DeploymentHistoryCreatorSpec( + deployment_id=deployment.id, + phase=lifecycle_value, + result=SchedulingResult.SKIPPED, + message="No deployment policy found", + from_status=EndpointLifecycle.DEPLOYING, + to_status=None, + sub_steps=[], + ) + for deployment in eval_result.skipped + ] + await self._deployment_repo.update_endpoint_lifecycle_bulk_with_history( + [], BulkCreator(specs=skipped_history_specs) + ) + for deployment in eval_result.skipped: + log.warning("Deployment {} skipped: no deployment policy found", deployment.id) + + async def _transition_completed_deployments( + self, + eval_result: EvaluationResult, + records: Mapping[UUID, ExecutionRecord], + ) -> None: + """Transition completed DEPLOYING deployments to READY. + + Atomically: + 1. Swap deploying_revision -> current_revision (with idempotency guard). + 2. 
Update lifecycle to READY with history recording. + 3. Send notification events. + """ + completed = eval_result.completed + strategies = eval_result.completed_strategies + endpoint_ids = {deployment.id for deployment in completed} + lifecycle_value = DeploymentLifecycleType.DEPLOYING.value + + target_statuses = [EndpointLifecycle.DEPLOYING] + from_status = EndpointLifecycle.DEPLOYING + to_status = EndpointLifecycle.READY + + batch_updater = BatchUpdater( + spec=EndpointLifecycleBatchUpdaterSpec(lifecycle_stage=to_status), + conditions=[ + DeploymentConditions.by_ids(list(endpoint_ids)), + DeploymentConditions.by_lifecycle_stages(target_statuses), + ], + ) + + timestamp_now = datetime.now(UTC).isoformat() + history_specs = [ + DeploymentHistoryCreatorSpec( + deployment_id=deployment.id, + phase=lifecycle_value, + result=SchedulingResult.SUCCESS, + message=f"Deployment completed successfully (strategy: {strategies[deployment.id].value})" + if deployment.id in strategies + else "Deployment completed successfully", + from_status=from_status, + to_status=to_status, + sub_steps=extract_sub_steps_for_entity(deployment.id, records), + ) + for deployment in completed + ] + + # Atomic: revision swap + lifecycle update + history recording + await self._deployment_repo.complete_deployment_and_transition_to_ready( + endpoint_ids, [batch_updater], BulkCreator(specs=history_specs) + ) + log.info( + "Atomically swapped revision and transitioned {} deployments to READY", + len(endpoint_ids), + ) + + # Send notifications + for deployment in completed: + try: + event = build_lifecycle_notification_event( + deployment=deployment, + from_status=from_status, + to_status=to_status, + transition_result="success", + timestamp=timestamp_now, + ) + await self._event_producer.anycast_event(event) + except Exception as e: + log.warning("Failed to send lifecycle notification: {}", e) diff --git a/src/ai/backend/manager/sokovan/deployment/strategy/types.py 
b/src/ai/backend/manager/sokovan/deployment/strategy/types.py index 48367c58843..6adcb204b24 100644 --- a/src/ai/backend/manager/sokovan/deployment/strategy/types.py +++ b/src/ai/backend/manager/sokovan/deployment/strategy/types.py @@ -49,28 +49,28 @@ class EvaluationResult: """Aggregate result of evaluating all DEPLOYING deployments.""" # In-progress deployments grouped by sub-step (PROVISIONING, PROGRESSING, etc.). - # The coordinator looks up the handler for each sub-step and calls execute(). + # DeployingHandler.prepare() resolves the sub-step handler for each group. groups: dict[DeploymentSubStep, EvaluationGroup] = field(default_factory=dict) # Deployments that satisfied all strategy FSM conditions and are ready to finish. - # The coordinator performs an atomic revision swap + READY transition for these. + # DeployingHandler.finalize() performs an atomic revision swap + READY transition. completed: list[DeploymentInfo] = field(default_factory=list) # Maps each completed deployment to the strategy (ROLLING, BLUE_GREEN) it used. - # The coordinator includes this in the history message for observability. + # Included in the history message for observability. completed_strategies: dict[UUID, DeploymentStrategy] = field(default_factory=dict) # Deployments skipped because no deployment policy was found. - # The coordinator records SKIPPED history and emits a warning log. + # DeployingHandler.finalize() records SKIPPED history and emits a warning log. skipped: list[DeploymentInfo] = field(default_factory=list) # Deployments that raised an exception during strategy FSM evaluation, paired - # with the error message. The coordinator records NEED_RETRY history and keeps - # the lifecycle at DEPLOYING so the next cycle can retry. + # with the error message. DeployingHandler.finalize() records NEED_RETRY history + # and keeps the lifecycle at DEPLOYING so the next cycle can retry. 
errors: list[tuple[DeploymentInfo, str]] = field(default_factory=list) # Aggregated route mutations from all per-deployment evaluations. - # The coordinator applies these after evaluation completes. + # DeployingHandler.prepare() applies these after evaluation completes. route_changes: RouteChanges = field(default_factory=RouteChanges) diff --git a/tests/unit/manager/sokovan/deployment/test_coordinator_history.py b/tests/unit/manager/sokovan/deployment/test_coordinator_history.py index ca3e9259b9e..73f3811b42e 100644 --- a/tests/unit/manager/sokovan/deployment/test_coordinator_history.py +++ b/tests/unit/manager/sokovan/deployment/test_coordinator_history.py @@ -171,6 +171,7 @@ def mock_handler_with_success( failure=None, ) ) + mock.prepare = AsyncMock(side_effect=lambda deployments: [(mock, deployments)]) mock.execute = AsyncMock( return_value=DeploymentExecutionResult( successes=[sample_deployment_info], @@ -178,6 +179,7 @@ def mock_handler_with_success( ) ) mock.post_process = AsyncMock() + mock.finalize = AsyncMock() return mock @@ -196,6 +198,7 @@ def mock_handler_with_failure( failure=DeploymentLifecycleStatus(lifecycle=EndpointLifecycle.DESTROYED), ) ) + mock.prepare = AsyncMock(side_effect=lambda deployments: [(mock, deployments)]) mock.execute = AsyncMock( return_value=DeploymentExecutionResult( successes=[], @@ -203,6 +206,7 @@ def mock_handler_with_failure( ) ) mock.post_process = AsyncMock() + mock.finalize = AsyncMock() return mock @@ -219,8 +223,10 @@ def mock_handler_with_empty_result() -> MagicMock: failure=None, ) ) + mock.prepare = AsyncMock(side_effect=lambda deployments: [(mock, deployments)]) mock.execute = AsyncMock(return_value=DeploymentExecutionResult()) mock.post_process = AsyncMock() + mock.finalize = AsyncMock() return mock From d90c39b0ca4c5468680c97590fe019eabbb71985 Mon Sep 17 00:00:00 2001 From: jopemachine Date: Wed, 4 Mar 2026 09:49:36 +0000 Subject: [PATCH 23/23] refactoring wip --- .../BEP-1049-deployment-strategy-handler.md | 
166 ++++++++++-------- 1 file changed, 95 insertions(+), 71 deletions(-) diff --git a/proposals/BEP-1049-deployment-strategy-handler.md b/proposals/BEP-1049-deployment-strategy-handler.md index 66df9129456..10e7d14f9ea 100644 --- a/proposals/BEP-1049-deployment-strategy-handler.md +++ b/proposals/BEP-1049-deployment-strategy-handler.md @@ -28,15 +28,15 @@ Blue-Green deployment spans multiple coordinator cycles through several phases: Rolling Update similarly progresses gradually across cycles. Both strategies **keep the deployment in `DEPLOYING` state across multiple processing cycles until strategy completion or rollback.** -### Evaluator + Sub-Step Handler Pattern +### Composite Handler Pattern (DeployingHandler) -A single `evaluate()` call may produce different sub-steps for different deployments — some completed, others still PROGRESSING. To handle this, a **strategy evaluator** groups deployments by sub-step, and **per-sub-step handlers** process each group. Completed deployments are returned separately in `EvaluationResult.completed` and processed directly by the coordinator's `_transition_completed_deployments()`. +A single `evaluate()` call may produce different sub-steps for different deployments — some completed, others still PROGRESSING. To handle this, DEPLOYING is represented as a **composite handler** (`DeployingHandler`) that internally owns the strategy evaluator and sub-step handlers. The coordinator treats DEPLOYING identically to every other lifecycle type through the unified `prepare → execute → finalize → post_process` flow. 
| Aspect | How it works | |--------|-------------| -| **State transition** | Each sub-step handler returns explicit `next_status()` → coordinator's generic path handles all transitions | -| **Routing** | Coordinator branches to evaluator path for `DeploymentLifecycleType.DEPLOYING` | -| **Cycles** | Evaluator runs strategy FSM → coordinator applies route changes → handlers process results → coordinator records history | +| **State transition** | Each sub-step handler returns explicit `status_transitions()` → coordinator's generic path handles all transitions | +| **Routing** | No special branching — `DeployingHandler.prepare()` runs the evaluator and returns sub-step handler tasks | +| **Cycles** | `prepare()`: evaluator runs strategy FSM + applies route changes → coordinator executes sub-step handlers → `finalize()`: records evaluation outcomes + transitions completed deployments | ## Sub-documents @@ -50,7 +50,7 @@ A single `evaluate()` call may produce different sub-steps for different deploym ### Overall Architecture -Core idea: A **strategy evaluator** evaluates DEPLOYING-state deployments and groups them by sub-step, then **per-sub-step handlers** process each group. The coordinator's generic `_handle_status_transitions()` path handles all history recording and lifecycle transitions. +Core idea: All lifecycle types — including DEPLOYING — follow a **single coordinator code path**: `prepare → execute → finalize → post_process`. The base `DeploymentHandler` provides default `prepare()` (returns self as single task) and `finalize()` (no-op). The composite `DeployingHandler` overrides these to run strategy evaluation, apply route changes, dispatch to sub-step handlers, and transition completed deployments. 
``` ┌──────────────────────────────────────────────────────────────────────────────┐ @@ -76,14 +76,20 @@ Core idea: A **strategy evaluator** evaluates DEPLOYING-state deployments and gr ┌──────────────────────────────────────────────────────────────────────────────┐ │ DeploymentCoordinator │ │ │ -│ process_deployment_lifecycle(type) │ -│ evaluator = evaluators.get(type) │ -│ ├─ evaluator exists → _process_with_evaluator() (evaluator path) │ -│ └─ no evaluator → existing single-handler path │ +│ process_deployment_lifecycle(type) ← single unified code path │ +│ handler = handlers[type] │ +│ ├─ handler.prepare(deployments) → handler_tasks │ +│ ├─ _execute_and_transition_handlers(handler_tasks) │ +│ ├─ handler.finalize(records) │ +│ └─ _post_process_handlers(results) │ │ │ -│ Handler map key: DeploymentHandlerKey │ -│ DeploymentLifecycleType ← single handlers │ -│ | (DeploymentLifecycleType, DeploymentSubStep) ← sub-step handlers │ +│ Handler map: Mapping[DeploymentLifecycleType, DeploymentHandler] │ +│ ├─ CHECK_PENDING → CheckPendingHandler │ +│ ├─ CHECK_REPLICA → CheckReplicaHandler │ +│ ├─ SCALING → ScalingHandler │ +│ ├─ RECONCILE → ReconcileHandler │ +│ ├─ DEPLOYING → DeployingHandler (composite) │ +│ └─ DESTROYING → DestroyingHandler │ │ │ │ Result handling (same generic path for all handlers): │ │ successes → next_status (transition + history) │ @@ -91,33 +97,28 @@ Core idea: A **strategy evaluator** evaluates DEPLOYING-state deployments and gr │ skipped → keep (no transition) │ └────────────────┬─────────────────────────────────────────────────────────────┘ │ - ┌──────────┴──────────────────────────┐ - ▼ ▼ -┌─────────────────────┐ ┌──────────────────────────────────────────────────┐ -│ DeploymentHandler │ │ DeploymentStrategyEvaluator │ -│ (single-handler) │ │ (evaluator path — DEPLOYING only) │ -│ │ │ │ -│ Implementations: │ │ evaluate(deployments) → EvaluationResult │ -│ ├─ CheckPending │ │ 1. Load policies/routes │ -│ ├─ Scaling │ │ 2. 
Run strategy FSM → CycleEvaluationResult │ -│ ├─ CheckReplica │ │ 3. Aggregate route changes │ -│ ├─ Reconcile │ │ 4. Group by sub-step │ -│ └─ Destroying │ └───────────────┬──────────────────────────────────┘ -└─────────────────────┘ │ - ▼ - ┌──────────────────────────────────────┐ - │ Per-Sub-Step Handlers (composite) │ - │ │ - │ (DEPLOYING, PROVISIONING) │ - │ → DeployingProvisioningHandler │ - │ (DEPLOYING, PROGRESSING) │ - │ → DeployingProgressingHandler │ - │ next_status: DEPLOYING │ - │ │ - │ (DEPLOYING, ROLLED_BACK) │ - │ → DeployingRolledBackHandler │ - │ next_status: READY │ - └──────────────────────────────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ DeploymentHandler (base) │ +│ ├─ prepare(deployments) → [(self, deployments)] ← default: single task │ +│ ├─ execute(deployments) → result ← abstract │ +│ ├─ finalize(records) → no-op ← default │ +│ └─ post_process(result) → ... ← abstract │ +│ │ +│ Simple handlers (CheckPending, Scaling, etc.): │ +│ Use defaults — prepare returns self, finalize is no-op │ +│ │ +│ DeployingHandler (composite): │ +│ ├─ prepare(): evaluator.evaluate() + apply route changes │ +│ │ → [(sub_handler, subset), ...] for each sub-step │ +│ ├─ finalize(): record evaluation outcomes + transition completed │ +│ └─ owns: │ +│ ├─ DeploymentStrategyEvaluator │ +│ └─ sub_step_handlers: │ +│ ├─ PROVISIONING → DeployingProvisioningHandler │ +│ ├─ PROGRESSING → DeployingProgressingHandler │ +│ └─ ROLLED_BACK → DeployingRolledBackHandler │ +└──────────────────────────────────────────────────────────────────────────────┘ ``` ### Revision Activation Trigger Branching @@ -170,7 +171,7 @@ Completion is not a sub-step but a signal on `CycleEvaluationResult.completed`. ### DeploymentStrategyEvaluator -`DeploymentStrategyEvaluator` evaluates DEPLOYING-state deployments and groups them by sub-step. It is a separate component (not a handler) that the coordinator invokes before handler execution. 
+`DeploymentStrategyEvaluator` evaluates DEPLOYING-state deployments and groups them by sub-step. It is owned by `DeployingHandler`, which invokes it during `prepare()`. The coordinator does not interact with the evaluator directly. #### Execution Flow @@ -217,13 +218,13 @@ DeploymentStrategyEvaluator.evaluate(deployments) #### Key Design Principles -1. **Route changes are aggregated by the evaluator, applied by the coordinator**: The evaluator collects route mutations (rollout/drain/promote) from each strategy FSM into `EvaluationResult.route_changes`. The coordinator's `_apply_route_changes()` applies them after evaluation. Individual handlers do not touch routes. -2. **Strategy FSMs implement a common interface via registry**: All strategy implementations extend the `BaseDeploymentStrategy` abstract base class and implement `evaluate_cycle()`. Concrete classes (`RollingUpdateStrategy`, `BlueGreenStrategy`) live in dedicated module files (`strategy/rolling_update.py`, `strategy/blue_green.py`). The coordinator owns a `DeploymentStrategyRegistry` that maps each `DeploymentStrategy` enum to its implementation class and expected spec type. The registry is injected into the evaluator, which uses it to instantiate the appropriate strategy per deployment. -3. **Only grouping is returned**: The evaluator classifies deployments by sub-step; actual processing (revision swap, deploying_revision cleanup, etc.) is delegated to handlers. +1. **Route changes are aggregated by the evaluator, applied by `DeployingHandler`**: The evaluator collects route mutations (rollout/drain/promote) from each strategy FSM into `EvaluationResult.route_changes`. `DeployingHandler._apply_route_changes()` applies them during `prepare()`. Individual sub-step handlers do not touch routes. +2. **Strategy FSMs implement a common interface via registry**: All strategy implementations extend the `BaseDeploymentStrategy` abstract base class and implement `evaluate_cycle()`. 
Concrete classes (`RollingUpdateStrategy`, `BlueGreenStrategy`) live in dedicated module files (`strategy/rolling_update.py`, `strategy/blue_green.py`). `DeployingHandler` creates and owns the `DeploymentStrategyRegistry`, which is injected into the evaluator to instantiate the appropriate strategy per deployment. +3. **Only grouping is returned**: The evaluator classifies deployments by sub-step; actual processing (revision swap, deploying_revision cleanup, etc.) is delegated to `DeployingHandler.finalize()` and sub-step handlers. ### Per-Sub-Step Handlers -Each handler is registered with a `(DeploymentLifecycleType, DeploymentSubStep)` composite key in the coordinator. +Each sub-step handler is owned by `DeployingHandler` and registered in its `sub_step_handlers` map keyed by `DeploymentSubStep`. They are not directly visible to the coordinator. #### State Transition Type: `DeploymentLifecycleStatus` @@ -316,51 +317,51 @@ class DeployingRolledBackHandler(DeploymentHandler): On rollback, only `deploying_revision` is cleared; `current_revision` is preserved. The coordinator transitions to READY. -### Coordinator Evaluator Path (`_process_with_evaluator`) +### Unified Coordinator Flow -The coordinator takes a separate path for lifecycle types that have an evaluator registered in `_deployment_evaluators`: +The coordinator uses a single code path for all lifecycle types, including DEPLOYING: ``` -_process_with_evaluator(lifecycle_type, evaluator) +process_deployment_lifecycle(lifecycle_type) │ - │ 1. Acquire distributed lock (evaluator.lock_id) - │ 2. Query DEPLOYING-state deployments + │ 1. Look up handler by lifecycle_type (simple enum key) + │ 2. Acquire distributed lock if handler.lock_id is set + │ 3. Query deployments by handler.target_statuses() │ - │ 3. Enter DeploymentRecorderContext.scope() + │ 4. 
Enter DeploymentRecorderContext.scope() │ ┌───────────────────────────────────────────────────────────────┐ │ │ │ - │ │ eval_result = evaluator.evaluate(deployments) │ - │ │ _apply_route_changes(eval_result) │ - │ │ ↑ coordinator applies rollout/drain/promote │ + │ │ handler_tasks = handler.prepare(deployments) │ + │ │ ↑ simple handlers: [(self, deployments)] │ + │ │ ↑ DeployingHandler: evaluator.evaluate() │ + │ │ + _apply_route_changes() │ + │ │ → [(sub_handler_A, subset_A), (sub_handler_B, subset_B)] │ │ │ │ - │ │ for sub_step, group in eval_result.groups: │ - │ │ handler = handlers[(lifecycle_type, sub_step)] │ - │ │ result = handler.execute(group) │ - │ │ handler_results[sub_step] = (handler, result) │ + │ │ for (h, deps) in handler_tasks: │ + │ │ result = h.execute(deps) │ │ │ │ │ │ all_records = pool.build_all_records() │ │ │ │ - │ │ for sub_step, (handler, result) in handler_results: │ - │ │ _handle_status_transitions(handler, result, all_records) │ - │ │ ↑ same generic transition logic as single-handler path │ + │ │ for (h, result) in handler_results: │ + │ │ _handle_status_transitions(h, result, all_records) │ + │ │ ↑ same generic transition logic for ALL handlers │ │ │ │ │ └───────────────────────────────────────────────────────────────┘ │ - │ 4. Post-process outside RecorderContext scope - │ for sub_step, (handler, result) in handler_results: - │ handler.post_process(result) - │ ↑ reschedule DEPLOYING cycle + trigger route provisioning + │ 5. handler.finalize(all_records) + │ ↑ simple handlers: no-op + │ ↑ DeployingHandler: record evaluation outcomes + │ + transition completed deployments (atomic revision swap + │ + DEPLOYING → READY + history recording) │ - │ 5. Transition completed deployments (coordinator direct) - │ if eval_result.completed: - │ _transition_completed_deployments(completed, records=all_records) - │ ↑ atomic revision swap + DEPLOYING → READY + history recording - │ ↑ includes route mutation sub_steps from this cycle + │ 6. 
Post-process outside RecorderContext scope + │ for (h, result) in handler_results: + │ h.post_process(result) │ ▼ ``` -Key: `_handle_status_transitions()` uses the **exact same generic method** as the single-handler path. It performs batch updates and history recording based on each handler's `next_status()`/`failure_status()`. Completed deployments bypass this path — their lifecycle transition is handled directly by the coordinator's `_transition_completed_deployments()`, which atomically performs the revision swap and DEPLOYING→READY transition. +Key: The coordinator has **no DEPLOYING-specific logic**. `_handle_status_transitions()` uses the same generic method for all handlers. DEPLOYING-specific concerns (evaluator invocation, route mutations, completion transitions) are fully encapsulated in `DeployingHandler.prepare()` and `DeployingHandler.finalize()`. ### Sub-Step Recording @@ -425,6 +426,29 @@ This enables: On strategy failure (all new routes fail), automatic rollback always occurs. +## Decision Log + +### 2026-03-04: Unified coordinator code path via composite handler pattern + +**Context**: PR #9566 review identified that the coordinator treated DEPLOYING as a special case with a separate method (`process_deploying_lifecycle`) and separate code path. This created two parallel flows, a union type for handler keys (`DeploymentLifecycleType | (DeploymentLifecycleType, DeploymentSubStep)`), and DEPLOYING-specific branching in the event handler. + +**Decision**: Refactor to a single unified code path using the composite handler pattern. + +Three design principles drove the change: + +1. **DEPLOYING generalization**: DEPLOYING is no longer a special lifecycle type. The coordinator processes it through the same `process_deployment_lifecycle()` as all other types. No `if DEPLOYING` branches exist in the coordinator or event handler. + +2. 
**Sub-step unification via `prepare()`/`finalize()`**: The base `DeploymentHandler` gains two concrete methods with defaults — `prepare()` returns `[(self, deployments)]` (treat self as single task) and `finalize()` is a no-op. Simple handlers use these defaults unchanged. The composite `DeployingHandler` overrides `prepare()` to run the evaluator and return sub-step handler tasks, and `finalize()` to record evaluation outcomes and transition completed deployments. + +3. **Evaluator interface integration**: The evaluator is no longer called directly by the coordinator. Instead, `DeployingHandler` owns the evaluator and invokes it within `prepare()`. The coordinator has no knowledge of strategy evaluation, route mutations, or completion transitions — these are fully encapsulated in the handler. + +**Changes**: +- `DeploymentHandler` base: added `prepare()`, `finalize()` with defaults +- New `DeployingHandler` composite class: owns evaluator, sub-step handlers, route mutation logic, completion transition logic +- `DeploymentCoordinator`: removed `process_deploying_lifecycle()`, `DeploymentHandlerKey` type, `_strategy_registry`/`_deploying_evaluator` fields, and four private methods moved to `DeployingHandler` +- Handler map key simplified: `Mapping[DeploymentLifecycleType, DeploymentHandler]` +- Event handler: removed DEPLOYING branch + ## References - [BEP-1006: Service Deployment Strategy](BEP-1006-service-deployment-strategy.md) — High-level design for Blue-Green and Rolling Update