From 34f58eff9722aa1fc5e2f7f136c54e90cd127f72 Mon Sep 17 00:00:00 2001 From: Pavel Tcholakov Date: Tue, 5 May 2026 21:36:06 +0200 Subject: [PATCH] Cap health check backoff and add Lambda deadline guard The exponential backoff in the ServiceDeployer custom resource handler grew without a cap, so a sustained admin reachability failure could keep the loop running for ~35 minutes - well past Lambda's 15-minute hard limit. The Lambda was killed mid-sleep without writing a CloudFormation response, leaving the stack to wait for its 60-minute step timeout. Cap each iteration's backoff at 20 seconds (plus up to 2 seconds of jitter) so the worst case stays under the deployer Lambda's default 5-minute timeout, and add a deadline guard based on context.getRemainingTimeInMillis() that aborts the loop early if the remaining budget cannot cover the next backoff sleep and health check request plus a reserve for registration retries, optional pruning, and the CFN response submission. Expose healthCheckRetryAttempts and healthCheckMaxBackoff on ServiceRegistrationProps for users who need to tune the loop. The new properties are forwarded to the custom resource only when explicitly set, so existing CloudFormation templates do not see a property diff. 
--- .../register-service-handler/index.mts | 58 +++++++++++++++++-- lib/restate-constructs/service-deployer.ts | 26 +++++++++ .../restate-constructs.test.ts.snap | 2 +- test/restate-constructs.test.ts | 48 +++++++++++++++ 4 files changed, 128 insertions(+), 6 deletions(-) diff --git a/lib/restate-constructs/register-service-handler/index.mts b/lib/restate-constructs/register-service-handler/index.mts index 999ece5..8362cef 100644 --- a/lib/restate-constructs/register-service-handler/index.mts +++ b/lib/restate-constructs/register-service-handler/index.mts @@ -10,6 +10,7 @@ */ import type { CloudFormationCustomResourceEvent } from "aws-lambda/trigger/cloudformation-custom-resource"; +import type { Context } from "aws-lambda"; import { GetSecretValueCommand, SecretsManagerClient } from "@aws-sdk/client-secrets-manager"; @@ -89,6 +90,12 @@ export interface RegistrationProperties { * @see force for allowing both breaking changes and overwrites */ breaking?: "true" | "false"; + + /** Override the maximum number of admin health check attempts before giving up. */ + healthCheckRetryAttempts?: number; + + /** Cap, in seconds, on the per-iteration backoff sleep used during health check retries. */ + healthCheckMaxBackoffSeconds?: number; } type RegisterDeploymentResponse = { @@ -96,12 +103,22 @@ type RegisterDeploymentResponse = { services: { name: string; revision: number; public: boolean }[]; }; -const MAX_HEALTH_CHECK_ATTEMPTS = 10; // This is intentionally quite long to allow some time for first-run EC2 and Docker boot up +const DEFAULT_HEALTH_CHECK_ATTEMPTS = 10; // Long enough to absorb first-run EC2/Docker boot up. +const DEFAULT_HEALTH_CHECK_MAX_BACKOFF_MS = 20_000; +const HEALTH_CHECK_REQUEST_TIMEOUT_MS = 5_000; +// Reserve at the end of the Lambda budget for the registration retries that follow a successful health check, optional +// pruning, and the CFN response submission. 
The deadline guard in the health check loop refuses to keep retrying +// once the remaining time would not cover this reserve plus the next request. +const POST_HEALTH_CHECK_RESERVE_MS = 60_000; const MAX_REGISTRATION_ATTEMPTS = 3; const DEPLOYMENTS_PATH = "deployments"; const SERVICES_PATH = "services"; +function healthCheckBackoffBaseMs(attempt: number, maxBackoffMs: number): number { + return Math.min(2 ** attempt * 1_000, maxBackoffMs); +} + interface HttpResponse { statusCode: number; body: string; @@ -154,12 +171,24 @@ async function httpRequest( * Custom Resource event handler for Restate service registration. This handler backs the custom resources created by * {@link ServiceDeployer} to facilitate Lambda service handler discovery. */ -export const handler = async function (event: CloudFormationCustomResourceEvent) { +export const handler = async function (event: CloudFormationCustomResourceEvent, context: Context) { console.log({ event }); const props = event.ResourceProperties as RegistrationProperties; const rejectUnauthorized = props.insecure !== "true"; + const maxHealthCheckAttempts = positiveIntOr( + props.healthCheckRetryAttempts, + DEFAULT_HEALTH_CHECK_ATTEMPTS, + "healthCheckRetryAttempts", + ); + const healthCheckMaxBackoffMs = + positiveIntOr( + props.healthCheckMaxBackoffSeconds, + DEFAULT_HEALTH_CHECK_MAX_BACKOFF_MS / 1_000, + "healthCheckMaxBackoffSeconds", + ) * 1_000; + if (event.RequestType === "Delete") { if (props.removalPolicy !== "destroy") { console.log("Removal policy is 'retain'; leaving deployment registered in Restate."); @@ -221,7 +250,7 @@ export const handler = async function (event: CloudFormationCustomResourceEvent) healthResponse = await httpRequest(healthCheckUrl, { method: "GET", headers: authHeader, - timeout: 5_000, + timeout: HEALTH_CHECK_REQUEST_TIMEOUT_MS, rejectUnauthorized, }); @@ -235,13 +264,21 @@ export const handler = async function (event: CloudFormationCustomResourceEvent) console.error(`Restate health check 
failed: "${errorMessage}" (attempt ${attempt})`); } - if (attempt >= MAX_HEALTH_CHECK_ATTEMPTS) { + if (attempt >= maxHealthCheckAttempts) { console.error(`Admin service health check failing after ${attempt} attempts.`); throw new Error(errorMessage ?? `(${healthResponse?.statusCode})`); } attempt += 1; - const waitTimeMillis = randomInt(2_000) + 2 ** attempt * 1_000; // 3s -> 6s -> 10s -> 18s -> 34s + const waitTimeMillis = randomInt(2_000) + healthCheckBackoffBaseMs(attempt, healthCheckMaxBackoffMs); + const requiredMs = waitTimeMillis + HEALTH_CHECK_REQUEST_TIMEOUT_MS + POST_HEALTH_CHECK_RESERVE_MS; + if (context && context.getRemainingTimeInMillis() < requiredMs) { + throw new Error( + `Health check loop aborted to preserve Lambda budget for CloudFormation response: ` + + `${context.getRemainingTimeInMillis()}ms remaining, need ${requiredMs}ms. ` + + `Last error: ${errorMessage ?? `(${healthResponse?.statusCode})`}`, + ); + } console.log(`Retrying after ${waitTimeMillis} ms...`); await sleep(waitTimeMillis); } @@ -392,6 +429,17 @@ async function sleep(millis: number) { return new Promise((resolve) => setTimeout(resolve, millis)); } +function positiveIntOr(raw: number | string | undefined, fallback: number, propertyName: string): number { + if (raw === undefined || raw === null || raw === "") { + return fallback; + } + const parsed = typeof raw === "number" ? 
raw : Number(raw); + if (!Number.isFinite(parsed) || parsed < 1) { + throw new Error(`Invalid value for ${propertyName}: expected a positive integer, got ${JSON.stringify(raw)}.`); + } + return Math.floor(parsed); +} + async function deleteDeployment( adminUrl: string, deploymentId: string, diff --git a/lib/restate-constructs/service-deployer.ts b/lib/restate-constructs/service-deployer.ts index 6eac279..b8f1a6c 100644 --- a/lib/restate-constructs/service-deployer.ts +++ b/lib/restate-constructs/service-deployer.ts @@ -148,6 +148,25 @@ export interface ServiceRegistrationProps { * @default false */ breaking?: boolean; + + /** + * Maximum number of admin health check attempts before the deployer gives up and reports failure to + * CloudFormation. Defaults to a value that, combined with `healthCheckMaxBackoff`, keeps the worst-case + * loop comfortably below the deployer Lambda's 5-minute default timeout. Increase only if you also + * raise the deployer's Lambda `timeout` to match. + * + * @default 10 + */ + healthCheckRetryAttempts?: number; + + /** + * Cap on the per-iteration backoff sleep used during admin health check retries. Without a cap, the + * exponential backoff grows fast enough that the deployer Lambda can be killed by the runtime mid-loop, + * leaving CloudFormation to wait for its 60-minute step timeout. + * + * @default Duration.seconds(20) + */ + healthCheckMaxBackoff?: cdk.Duration; } /** @@ -301,6 +320,13 @@ export class ServiceDeployer extends Construct { maxPrunedPerRun: options?.maxPrunedPerRun ?? 10, force: (options?.force ?? false).toString() as "true" | "false", breaking: (options?.breaking ?? false).toString() as "true" | "false", + // Forward retry knobs only when the caller sets them, to avoid CFN property diffs for existing users. + ...(options?.healthCheckRetryAttempts !== undefined + ? { healthCheckRetryAttempts: options.healthCheckRetryAttempts } + : {}), + ...(options?.healthCheckMaxBackoff !== undefined + ? 
{ healthCheckMaxBackoffSeconds: options.healthCheckMaxBackoff.toSeconds() } + : {}), } satisfies RegistrationProperties, }); diff --git a/test/__snapshots__/restate-constructs.test.ts.snap b/test/__snapshots__/restate-constructs.test.ts.snap index 0e0efd0..abbfb12 100644 --- a/test/__snapshots__/restate-constructs.test.ts.snap +++ b/test/__snapshots__/restate-constructs.test.ts.snap @@ -1851,7 +1851,7 @@ exports[`Restate constructs Service Deployer overrides 1`] = ` - arm64 Code: S3Bucket: cdk-hnb659fds-assets-account-id-region - S3Key: 8ed2b7e40aa8b43d18da885b2d5ec8673277d0810b888d6becb080a6f280a64f.zip + S3Key: 595464e51d4a001ceaa6194ce91bd75f1475b9359b2b4da2271fe26a9d65260e.zip Description: Restate custom registration handler Handler: entrypoint.handler MemorySize: 128 diff --git a/test/restate-constructs.test.ts b/test/restate-constructs.test.ts index fa4cf9d..24f4599 100644 --- a/test/restate-constructs.test.ts +++ b/test/restate-constructs.test.ts @@ -15,6 +15,7 @@ import * as iam from "aws-cdk-lib/aws-iam"; import * as lambda from "aws-cdk-lib/aws-lambda"; import * as route53 from "aws-cdk-lib/aws-route53"; import * as secrets from "aws-cdk-lib/aws-secretsmanager"; +import { Template } from "aws-cdk-lib/assertions"; import "jest-cdk-snapshot"; import { FargateRestateDeployment, @@ -235,6 +236,53 @@ describe("Restate constructs", () => { }); }); + test("Service Deployer health check retry overrides are forwarded to the custom resource", () => { + const app = new cdk.App(); + const stack = new cdk.Stack(app, "ServiceDeployerRetryOverrides", { + env: { account: "account-id", region: "region" }, + }); + + const restateEnvironment = RestateEnvironment.fromAttributes({ + adminUrl: "https://restate.example.com:9070", + }); + + const handler = mockHandler(stack); + const serviceDeployer = new ServiceDeployer(stack, "ServiceDeployer", { + code: lambda.Code.fromAsset("dist/register-service-handler"), + }); + serviceDeployer.register(handler.currentVersion, 
restateEnvironment, { + healthCheckRetryAttempts: 7, + healthCheckMaxBackoff: cdk.Duration.seconds(15), + }); + + const properties = Template.fromStack(stack).findResources("Custom::RestateServiceDeployment"); + const customResource = Object.values(properties)[0]!.Properties as Record<string, unknown>; + expect(customResource.healthCheckRetryAttempts).toBe(7); + expect(customResource.healthCheckMaxBackoffSeconds).toBe(15); + }); + + test("Service Deployer omits health check retry properties when not set", () => { + const app = new cdk.App(); + const stack = new cdk.Stack(app, "ServiceDeployerRetryDefaults", { + env: { account: "account-id", region: "region" }, + }); + + const restateEnvironment = RestateEnvironment.fromAttributes({ + adminUrl: "https://restate.example.com:9070", + }); + + const handler = mockHandler(stack); + const serviceDeployer = new ServiceDeployer(stack, "ServiceDeployer", { + code: lambda.Code.fromAsset("dist/register-service-handler"), + }); + serviceDeployer.register(handler.currentVersion, restateEnvironment); + + const properties = Template.fromStack(stack).findResources("Custom::RestateServiceDeployment"); + const customResource = Object.values(properties)[0]!.Properties as Record<string, unknown>; + expect("healthCheckRetryAttempts" in customResource).toBe(false); + expect("healthCheckMaxBackoffSeconds" in customResource).toBe(false); + }); + test("[Experimental] Create a self-hosted Restate environment deployed on ECS Fargate", () => { const app = new cdk.App(); const stack = new cdk.Stack(app, "RestateOnFargateStack", {