-
Notifications
You must be signed in to change notification settings - Fork 168
feat(BA-3436): Implement Blue-Green deployment strategy #9568
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: BA-4821
Are you sure you want to change the base?
Changes from all commits
7f2615e
12347c5
c8bc1f1
febce73
506588b
0df575d
03bac41
af7f67b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Implement Blue-Green deployment strategy |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1406,19 +1406,49 @@ async def fetch_active_routes_by_endpoint_ids( | |
| routes_by_endpoint[row.endpoint].append(row.to_route_info()) | ||
| return routes_by_endpoint | ||
|
|
||
async def fetch_routes_by_endpoint_ids(
    self,
    endpoint_ids: set[uuid.UUID],
) -> Mapping[uuid.UUID, list[RouteInfo]]:
    """Fetch all routes for given endpoint IDs (no status filter).

    Unlike fetch_active_routes_by_endpoint_ids, this includes routes
    in all statuses (FAILED_TO_START, TERMINATED, etc.), which is
    required for blue-green rollback detection.

    Returns a mapping from endpoint ID to the route infos found for it.
    An empty ``endpoint_ids`` short-circuits to an empty mapping without
    opening a database session; endpoints that have no routing rows are
    not pre-populated in the result.
    """
    if not endpoint_ids:
        return {}

    async with self._begin_readonly_session_read_committed() as db_sess:
        query = sa.select(RoutingRow).where(
            RoutingRow.endpoint.in_(endpoint_ids),
        )
        result = await db_sess.execute(query)
        rows: Sequence[RoutingRow] = result.scalars().all()
        routes_by_endpoint: defaultdict[uuid.UUID, list[RouteInfo]] = defaultdict(list)
        for row in rows:
            # defaultdict(list) creates the per-endpoint bucket on first
            # access, so no explicit membership check is needed.
            routes_by_endpoint[row.endpoint].append(row.to_route_info())
        return routes_by_endpoint
|
|
||
| async def scale_routes( | ||
| self, | ||
| scale_out_creators: Sequence[Creator[RoutingRow]], | ||
| scale_in_updater: BatchUpdater[RoutingRow] | None, | ||
| promote_updater: BatchUpdater[RoutingRow] | None = None, | ||
| ) -> None: | ||
| """Scale out/in routes based on provided creators and updater.""" | ||
| """Scale out/in/promote routes based on provided creators and updaters.""" | ||
| async with self._begin_session_read_committed() as db_sess: | ||
| # Scale out routes | ||
| for creator in scale_out_creators: | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [MEDIUM] Finding #5: Scale-in executes before promote in the DB transaction. In `scale_routes`, the operations run in this order: (1) scale out, (2) scale in, (3) promote.
During step 2, the blue routes are set to TERMINATING before the green routes are promoted to ACTIVE in step 3. While this is within a single transaction and the commit is atomic, with READ COMMITTED isolation, other transactions reading during this window could see intermediate state. Suggested fix: Consider reordering to promote first, then scale-in. This way, there is a brief overlap period where both blue and green are ACTIVE (which is safer for availability than having neither active). Alternatively, consider using SERIALIZABLE isolation for this specific operation. |
||
| await execute_creator(db_sess, creator) | ||
| # Scale in routes | ||
| if scale_in_updater: | ||
| await execute_batch_updater(db_sess, scale_in_updater) | ||
| # Promote routes (blue-green) | ||
| if promote_updater: | ||
| await execute_batch_updater(db_sess, promote_updater) | ||
|
|
||
| # Route operations | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,26 +1,194 @@ | ||
| """Blue-green deployment strategy evaluation for a single deployment cycle (BEP-1049). | ||
|
|
||
| Provisions a full set of new-revision routes, validates them, then atomically | ||
| switches traffic from the old revision to the new one. | ||
| Provisions a full set of new-revision routes (INACTIVE), validates them, then | ||
| atomically switches traffic from the old revision to the new one. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import logging | ||
| from collections.abc import Sequence | ||
| from datetime import UTC, datetime | ||
|
|
||
| from ai.backend.logging import BraceStyleAdapter | ||
| from ai.backend.manager.data.deployment.types import ( | ||
| DeploymentInfo, | ||
| DeploymentSubStep, | ||
| RouteInfo, | ||
| RouteStatus, | ||
| RouteTrafficStatus, | ||
| ) | ||
| from ai.backend.manager.models.deployment_policy import BlueGreenSpec | ||
| from ai.backend.manager.models.routing import RoutingRow | ||
| from ai.backend.manager.repositories.base import Creator | ||
| from ai.backend.manager.repositories.deployment.creators import RouteCreatorSpec | ||
|
|
||
| from .types import CycleEvaluationResult | ||
| from .types import CycleEvaluationResult, RouteChanges | ||
|
|
||
| log = BraceStyleAdapter(logging.getLogger(__name__)) | ||
|
|
||
|
|
||
| def blue_green_evaluate( | ||
| deployment: DeploymentInfo, | ||
| routes: Sequence[RouteInfo], | ||
| spec: BlueGreenSpec, | ||
| ) -> CycleEvaluationResult: | ||
| """Evaluate one cycle of blue-green deployment for a single deployment.""" | ||
| raise NotImplementedError("Blue-green deployment strategy is not yet implemented") | ||
| """Evaluate one cycle of blue-green deployment for a single deployment. | ||
|
|
||
| FSM flow: | ||
| 1. Classify routes into blue (old) / green (new) by revision_id. | ||
| 2. If no green routes → create all green (INACTIVE) → PROVISIONING. | ||
| 3. If any green PROVISIONING → PROVISIONING (wait). | ||
| 4. If all green failed → scale_in green → ROLLED_BACK. | ||
| 5. If not all green healthy → PROGRESSING (wait). | ||
| 6. If all green healthy + auto_promote=False → PROGRESSING (manual wait). | ||
| 7. If all green healthy + auto_promote=True + delay>0 → PROGRESSING (delay wait). | ||
| 8. If all green healthy + auto_promote=True + delay=0 → promote + completed. | ||
| """ | ||
| deploying_rev = deployment.deploying_revision_id | ||
| desired = deployment.replica_spec.target_replica_count | ||
|
|
||
| # ── 1. Classify routes ── | ||
| blue_active: list[RouteInfo] = [] | ||
| green_provisioning: list[RouteInfo] = [] | ||
| green_healthy: list[RouteInfo] = [] | ||
| green_failed: list[RouteInfo] = [] | ||
|
|
||
| for r in routes: | ||
| is_green = r.revision_id == deploying_rev | ||
| if not is_green: | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [HIGH] Finding #1: Rollback path (step 4) is unreachable in production The evaluator calls This means the FSM will never see The unit tests pass because they call Suggested fix: Either:
|
||
| if r.status.is_active(): | ||
| blue_active.append(r) | ||
| continue | ||
|
|
||
| if r.status == RouteStatus.PROVISIONING: | ||
| green_provisioning.append(r) | ||
| elif r.status == RouteStatus.HEALTHY: | ||
| green_healthy.append(r) | ||
| elif r.status in (RouteStatus.FAILED_TO_START, RouteStatus.TERMINATED): | ||
| green_failed.append(r) | ||
| elif r.status.is_active(): | ||
| green_healthy.append(r) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [MEDIUM] Finding #4: UNHEALTHY and DEGRADED green routes silently classified as "healthy" The While Suggested fix: Consider adding explicit handling for
|
||
|
|
||
| total_green_live = len(green_provisioning) + len(green_healthy) | ||
|
|
||
| # ── 2. No green routes → create all green (INACTIVE) ── | ||
| if total_green_live == 0 and not green_failed: | ||
| log.debug( | ||
| "deployment {}: no green routes — creating {} INACTIVE routes", | ||
| deployment.id, | ||
| desired, | ||
| ) | ||
| route_changes = RouteChanges( | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [MEDIUM] Finding #3: Mixed healthy+failed green routes return PROGRESSING indefinitely with no recovery path When some green routes are healthy and some have failed (but none are provisioning), the FSM reaches step 5 (
This creates a stuck deployment that will return Suggested fix: Consider one of:
Note: This finding is partially related to Finding #1 -- if failed routes are not fetched from the DB, this scenario manifests as fewer green routes than expected, but the FSM still gets stuck at step 5. |
||
| rollout_specs=_build_route_creators(deployment, desired), | ||
| ) | ||
| return CycleEvaluationResult( | ||
| sub_step=DeploymentSubStep.PROVISIONING, | ||
| route_changes=route_changes, | ||
| ) | ||
|
|
||
| # ── 3. Green PROVISIONING → wait ── | ||
| if green_provisioning: | ||
| log.debug( | ||
| "deployment {}: {} green routes still provisioning", | ||
| deployment.id, | ||
| len(green_provisioning), | ||
| ) | ||
| return CycleEvaluationResult(sub_step=DeploymentSubStep.PROVISIONING) | ||
|
|
||
| # ── 4. All green failed → rollback ── | ||
| if total_green_live == 0 and green_failed: | ||
| log.warning( | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [HIGH] Finding #2: When Every subsequent cycle will re-evaluate, see Suggested fix: Either:
|
||
| "deployment {}: all {} green routes failed — rolling back", | ||
| deployment.id, | ||
| len(green_failed), | ||
| ) | ||
| route_changes = RouteChanges( | ||
| drain_route_ids=[r.route_id for r in green_failed], | ||
| ) | ||
| return CycleEvaluationResult( | ||
| sub_step=DeploymentSubStep.ROLLED_BACK, | ||
| route_changes=route_changes, | ||
| ) | ||
|
|
||
| # ── 5. Not all green healthy → PROGRESSING (wait) ── | ||
| if len(green_healthy) < desired: | ||
| log.debug( | ||
| "deployment {}: green healthy={}/{} — waiting", | ||
| deployment.id, | ||
| len(green_healthy), | ||
| desired, | ||
| ) | ||
| return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) | ||
|
|
||
| # ── All green healthy from here ── | ||
|
|
||
| # ── 6. auto_promote=False → PROGRESSING (manual wait) ── | ||
| if not spec.auto_promote: | ||
| log.debug( | ||
| "deployment {}: all green healthy, waiting for manual promotion", | ||
| deployment.id, | ||
| ) | ||
| return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) | ||
|
|
||
| # ── 7. auto_promote=True + delay>0 → check elapsed time ── | ||
| if spec.promote_delay_seconds > 0: | ||
| latest_healthy_at = _latest_status_updated_at(green_healthy) | ||
| if latest_healthy_at is None: | ||
| log.debug( | ||
| "deployment {}: all green healthy but status_updated_at unknown — waiting", | ||
| deployment.id, | ||
| ) | ||
| return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) | ||
| elapsed = (datetime.now(UTC) - latest_healthy_at).total_seconds() | ||
| if elapsed < spec.promote_delay_seconds: | ||
| log.debug( | ||
| "deployment {}: promote delay {:.0f}/{} seconds elapsed — waiting", | ||
| deployment.id, | ||
| elapsed, | ||
| spec.promote_delay_seconds, | ||
| ) | ||
| return CycleEvaluationResult(sub_step=DeploymentSubStep.PROGRESSING) | ||
|
|
||
| # ── 8. Promotion: green → ACTIVE, blue → TERMINATING ── | ||
| log.info( | ||
| "deployment {}: promoting {} green routes, terminating {} blue routes", | ||
| deployment.id, | ||
| len(green_healthy), | ||
| len(blue_active), | ||
| ) | ||
| route_changes = RouteChanges( | ||
| promote_route_ids=[r.route_id for r in green_healthy], | ||
| drain_route_ids=[r.route_id for r in blue_active], | ||
| ) | ||
| return CycleEvaluationResult( | ||
| sub_step=DeploymentSubStep.PROGRESSING, | ||
| completed=True, | ||
| route_changes=route_changes, | ||
| ) | ||
|
|
||
|
|
||
def _latest_status_updated_at(routes: list[RouteInfo]) -> datetime | None:
    """Pick the newest ``status_updated_at`` among the given routes.

    Routes whose ``status_updated_at`` is ``None`` are ignored. ``None`` is
    returned when no route carries a timestamp, which includes an empty list.
    """
    known_timestamps = (
        route.status_updated_at for route in routes if route.status_updated_at is not None
    )
    return max(known_timestamps, default=None)
|
|
||
|
|
||
def _build_route_creators(
    deployment: DeploymentInfo,
    count: int,
) -> list[Creator[RoutingRow]]:
    """Produce ``count`` creators for green routes of the deploying revision.

    Every creator wraps a ``RouteCreatorSpec`` that targets this deployment's
    endpoint and deploying revision, with traffic status INACTIVE and a
    traffic ratio of 0.0. A non-positive ``count`` yields an empty list.
    """
    return [
        Creator(
            spec=RouteCreatorSpec(
                endpoint_id=deployment.id,
                session_owner_id=deployment.metadata.session_owner,
                domain=deployment.metadata.domain,
                project_id=deployment.metadata.project,
                revision_id=deployment.deploying_revision_id,
                traffic_status=RouteTrafficStatus.INACTIVE,
                traffic_ratio=0.0,
            )
        )
        for _ in range(count)
    ]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Scaling in/out and promoting should be considered separately.