diff --git a/lib/restate-constructs/register-service-handler/index.mts b/lib/restate-constructs/register-service-handler/index.mts index 999ece5..8362cef 100644 --- a/lib/restate-constructs/register-service-handler/index.mts +++ b/lib/restate-constructs/register-service-handler/index.mts @@ -10,6 +10,7 @@ */ import type { CloudFormationCustomResourceEvent } from "aws-lambda/trigger/cloudformation-custom-resource"; +import type { Context } from "aws-lambda"; import { GetSecretValueCommand, SecretsManagerClient } from "@aws-sdk/client-secrets-manager"; @@ -89,6 +90,12 @@ export interface RegistrationProperties { * @see force for allowing both breaking changes and overwrites */ breaking?: "true" | "false"; + + /** Override the maximum number of admin health check attempts before giving up. */ + healthCheckRetryAttempts?: number; + + /** Cap, in seconds, on the per-iteration backoff sleep used during health check retries. */ + healthCheckMaxBackoffSeconds?: number; } type RegisterDeploymentResponse = { @@ -96,12 +103,22 @@ type RegisterDeploymentResponse = { services: { name: string; revision: number; public: boolean }[]; }; -const MAX_HEALTH_CHECK_ATTEMPTS = 10; // This is intentionally quite long to allow some time for first-run EC2 and Docker boot up +const DEFAULT_HEALTH_CHECK_ATTEMPTS = 10; // Long enough to absorb first-run EC2/Docker boot up. +const DEFAULT_HEALTH_CHECK_MAX_BACKOFF_MS = 20_000; +const HEALTH_CHECK_REQUEST_TIMEOUT_MS = 5_000; +// Reserve at the end of the Lambda budget for the registration retries that follow a successful health check, optional +// pruning, and the CFN response submission. The deadline guard in the health check loop refuses to keep retrying +// once the remaining time would not cover this reserve plus the next request. 
+const POST_HEALTH_CHECK_RESERVE_MS = 60_000; const MAX_REGISTRATION_ATTEMPTS = 3; const DEPLOYMENTS_PATH = "deployments"; const SERVICES_PATH = "services"; +function healthCheckBackoffBaseMs(attempt: number, maxBackoffMs: number): number { + return Math.min(2 ** attempt * 1_000, maxBackoffMs); +} + interface HttpResponse { statusCode: number; body: string; @@ -154,12 +171,24 @@ async function httpRequest( * Custom Resource event handler for Restate service registration. This handler backs the custom resources created by * {@link ServiceDeployer} to facilitate Lambda service handler discovery. */ -export const handler = async function (event: CloudFormationCustomResourceEvent) { +export const handler = async function (event: CloudFormationCustomResourceEvent, context: Context) { console.log({ event }); const props = event.ResourceProperties as RegistrationProperties; const rejectUnauthorized = props.insecure !== "true"; + const maxHealthCheckAttempts = positiveIntOr( + props.healthCheckRetryAttempts, + DEFAULT_HEALTH_CHECK_ATTEMPTS, + "healthCheckRetryAttempts", + ); + const healthCheckMaxBackoffMs = + positiveIntOr( + props.healthCheckMaxBackoffSeconds, + DEFAULT_HEALTH_CHECK_MAX_BACKOFF_MS / 1_000, + "healthCheckMaxBackoffSeconds", + ) * 1_000; + if (event.RequestType === "Delete") { if (props.removalPolicy !== "destroy") { console.log("Removal policy is 'retain'; leaving deployment registered in Restate."); @@ -221,7 +250,7 @@ export const handler = async function (event: CloudFormationCustomResourceEvent) healthResponse = await httpRequest(healthCheckUrl, { method: "GET", headers: authHeader, - timeout: 5_000, + timeout: HEALTH_CHECK_REQUEST_TIMEOUT_MS, rejectUnauthorized, }); @@ -235,13 +264,21 @@ export const handler = async function (event: CloudFormationCustomResourceEvent) console.error(`Restate health check failed: "${errorMessage}" (attempt ${attempt})`); } - if (attempt >= MAX_HEALTH_CHECK_ATTEMPTS) { + if (attempt >= maxHealthCheckAttempts) { 
console.error(`Admin service health check failing after ${attempt} attempts.`); throw new Error(errorMessage ?? `(${healthResponse?.statusCode})`); } attempt += 1; - const waitTimeMillis = randomInt(2_000) + 2 ** attempt * 1_000; // 3s -> 6s -> 10s -> 18s -> 34s + const waitTimeMillis = randomInt(2_000) + healthCheckBackoffBaseMs(attempt, healthCheckMaxBackoffMs); + const requiredMs = waitTimeMillis + HEALTH_CHECK_REQUEST_TIMEOUT_MS + POST_HEALTH_CHECK_RESERVE_MS; + if (context && context.getRemainingTimeInMillis() < requiredMs) { + throw new Error( + `Health check loop aborted to preserve Lambda budget for CloudFormation response: ` + + `${context.getRemainingTimeInMillis()}ms remaining, need ${requiredMs}ms. ` + + `Last error: ${errorMessage ?? `(${healthResponse?.statusCode})`}`, + ); + } console.log(`Retrying after ${waitTimeMillis} ms...`); await sleep(waitTimeMillis); } @@ -392,6 +429,17 @@ async function sleep(millis: number) { return new Promise((resolve) => setTimeout(resolve, millis)); } +function positiveIntOr(raw: number | string | undefined, fallback: number, propertyName: string): number { + if (raw === undefined || raw === null || raw === "") { + return fallback; + } + const parsed = typeof raw === "number" ? raw : Number(raw); + if (!Number.isFinite(parsed) || parsed < 1) { + throw new Error(`Invalid value for ${propertyName}: expected a positive integer, got ${JSON.stringify(raw)}.`); + } + return Math.floor(parsed); +} + async function deleteDeployment( adminUrl: string, deploymentId: string, diff --git a/lib/restate-constructs/service-deployer.ts b/lib/restate-constructs/service-deployer.ts index 6eac279..b8f1a6c 100644 --- a/lib/restate-constructs/service-deployer.ts +++ b/lib/restate-constructs/service-deployer.ts @@ -148,6 +148,25 @@ export interface ServiceRegistrationProps { * @default false */ breaking?: boolean; + + /** + * Maximum number of admin health check attempts before the deployer gives up and reports failure to + * CloudFormation. 
Defaults to a value that, combined with `healthCheckMaxBackoff`, keeps the worst-case + * loop comfortably below the deployer Lambda's 5-minute default timeout. Increase only if you also + * raise the deployer's Lambda `timeout` to match. + * + * @default 10 + */ + healthCheckRetryAttempts?: number; + + /** + * Cap on the per-iteration backoff sleep used during admin health check retries. Without a cap, the + * exponential backoff grows fast enough that the deployer Lambda can be killed by the runtime mid-loop, + * leaving CloudFormation to wait for its 60-minute step timeout. + * + * @default Duration.seconds(20) + */ + healthCheckMaxBackoff?: cdk.Duration; } /** @@ -301,6 +320,13 @@ export class ServiceDeployer extends Construct { maxPrunedPerRun: options?.maxPrunedPerRun ?? 10, force: (options?.force ?? false).toString() as "true" | "false", breaking: (options?.breaking ?? false).toString() as "true" | "false", + // Forward retry knobs only when the caller sets them, to avoid CFN property diffs for existing users. + ...(options?.healthCheckRetryAttempts !== undefined + ? { healthCheckRetryAttempts: options.healthCheckRetryAttempts } + : {}), + ...(options?.healthCheckMaxBackoff !== undefined + ? 
{ healthCheckMaxBackoffSeconds: options.healthCheckMaxBackoff.toSeconds() } + : {}), } satisfies RegistrationProperties, }); diff --git a/test/__snapshots__/restate-constructs.test.ts.snap b/test/__snapshots__/restate-constructs.test.ts.snap index 0e0efd0..abbfb12 100644 --- a/test/__snapshots__/restate-constructs.test.ts.snap +++ b/test/__snapshots__/restate-constructs.test.ts.snap @@ -1851,7 +1851,7 @@ exports[`Restate constructs Service Deployer overrides 1`] = ` - arm64 Code: S3Bucket: cdk-hnb659fds-assets-account-id-region - S3Key: 8ed2b7e40aa8b43d18da885b2d5ec8673277d0810b888d6becb080a6f280a64f.zip + S3Key: 595464e51d4a001ceaa6194ce91bd75f1475b9359b2b4da2271fe26a9d65260e.zip Description: Restate custom registration handler Handler: entrypoint.handler MemorySize: 128 diff --git a/test/restate-constructs.test.ts b/test/restate-constructs.test.ts index fa4cf9d..24f4599 100644 --- a/test/restate-constructs.test.ts +++ b/test/restate-constructs.test.ts @@ -15,6 +15,7 @@ import * as iam from "aws-cdk-lib/aws-iam"; import * as lambda from "aws-cdk-lib/aws-lambda"; import * as route53 from "aws-cdk-lib/aws-route53"; import * as secrets from "aws-cdk-lib/aws-secretsmanager"; +import { Template } from "aws-cdk-lib/assertions"; import "jest-cdk-snapshot"; import { FargateRestateDeployment, @@ -235,6 +236,53 @@ describe("Restate constructs", () => { }); }); + test("Service Deployer health check retry overrides are forwarded to the custom resource", () => { + const app = new cdk.App(); + const stack = new cdk.Stack(app, "ServiceDeployerRetryOverrides", { + env: { account: "account-id", region: "region" }, + }); + + const restateEnvironment = RestateEnvironment.fromAttributes({ + adminUrl: "https://restate.example.com:9070", + }); + + const handler = mockHandler(stack); + const serviceDeployer = new ServiceDeployer(stack, "ServiceDeployer", { + code: lambda.Code.fromAsset("dist/register-service-handler"), + }); + serviceDeployer.register(handler.currentVersion, 
restateEnvironment, { + healthCheckRetryAttempts: 7, + healthCheckMaxBackoff: cdk.Duration.seconds(15), + }); + + const properties = Template.fromStack(stack).findResources("Custom::RestateServiceDeployment"); + const customResource = Object.values(properties)[0]!.Properties as Record<string, unknown>; + expect(customResource.healthCheckRetryAttempts).toBe(7); + expect(customResource.healthCheckMaxBackoffSeconds).toBe(15); + }); + + test("Service Deployer omits health check retry properties when not set", () => { + const app = new cdk.App(); + const stack = new cdk.Stack(app, "ServiceDeployerRetryDefaults", { + env: { account: "account-id", region: "region" }, + }); + + const restateEnvironment = RestateEnvironment.fromAttributes({ + adminUrl: "https://restate.example.com:9070", + }); + + const handler = mockHandler(stack); + const serviceDeployer = new ServiceDeployer(stack, "ServiceDeployer", { + code: lambda.Code.fromAsset("dist/register-service-handler"), + }); + serviceDeployer.register(handler.currentVersion, restateEnvironment); + + const properties = Template.fromStack(stack).findResources("Custom::RestateServiceDeployment"); + const customResource = Object.values(properties)[0]!.Properties as Record<string, unknown>; + expect("healthCheckRetryAttempts" in customResource).toBe(false); + expect("healthCheckMaxBackoffSeconds" in customResource).toBe(false); + }); + test("[Experimental] Create a self-hosted Restate environment deployed on ECS Fargate", () => { const app = new cdk.App(); const stack = new cdk.Stack(app, "RestateOnFargateStack", {