Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 53 additions & 5 deletions lib/restate-constructs/register-service-handler/index.mts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
*/

import type { CloudFormationCustomResourceEvent } from "aws-lambda/trigger/cloudformation-custom-resource";
import type { Context } from "aws-lambda";

import { GetSecretValueCommand, SecretsManagerClient } from "@aws-sdk/client-secrets-manager";

Expand Down Expand Up @@ -89,19 +90,35 @@ export interface RegistrationProperties {
* @see force for allowing both breaking changes and overwrites
*/
breaking?: "true" | "false";

/** Override the maximum number of admin health check attempts before giving up. */
healthCheckRetryAttempts?: number;

/** Cap, in seconds, on the per-iteration backoff sleep used during health check retries. */
healthCheckMaxBackoffSeconds?: number;
}

type RegisterDeploymentResponse = {
id: string;
services: { name: string; revision: number; public: boolean }[];
};

const MAX_HEALTH_CHECK_ATTEMPTS = 10; // This is intentionally quite long to allow some time for first-run EC2 and Docker boot up
const DEFAULT_HEALTH_CHECK_ATTEMPTS = 10; // Long enough to absorb first-run EC2/Docker boot up.
const DEFAULT_HEALTH_CHECK_MAX_BACKOFF_MS = 20_000;
const HEALTH_CHECK_REQUEST_TIMEOUT_MS = 5_000;
// Reserve at the end of the Lambda budget for the registration retries that follow a successful health check, optional
// pruning, and the CFN response submission. The deadline guard in the health check loop refuses to keep retrying
// once the remaining time would not cover this reserve plus the next request.
const POST_HEALTH_CHECK_RESERVE_MS = 60_000;
const MAX_REGISTRATION_ATTEMPTS = 3;

const DEPLOYMENTS_PATH = "deployments";
const SERVICES_PATH = "services";

/**
 * Computes the deterministic portion of a health check retry delay: one second doubled `attempt` times,
 * clamped so that it never exceeds `maxBackoffMs`.
 *
 * @param attempt - Exponent applied to the base of two.
 * @param maxBackoffMs - Upper bound, in milliseconds, on the returned delay.
 * @returns The smaller of the exponential delay and `maxBackoffMs`, in milliseconds.
 */
function healthCheckBackoffBaseMs(attempt: number, maxBackoffMs: number): number {
  const exponentialMs = 1_000 * Math.pow(2, attempt);
  return Math.min(maxBackoffMs, exponentialMs);
}

interface HttpResponse {
statusCode: number;
body: string;
Expand Down Expand Up @@ -154,12 +171,24 @@ async function httpRequest(
* Custom Resource event handler for Restate service registration. This handler backs the custom resources created by
* {@link ServiceDeployer} to facilitate Lambda service handler discovery.
*/
export const handler = async function (event: CloudFormationCustomResourceEvent) {
export const handler = async function (event: CloudFormationCustomResourceEvent, context: Context) {
console.log({ event });

const props = event.ResourceProperties as RegistrationProperties;
const rejectUnauthorized = props.insecure !== "true";

const maxHealthCheckAttempts = positiveIntOr(
props.healthCheckRetryAttempts,
DEFAULT_HEALTH_CHECK_ATTEMPTS,
"healthCheckRetryAttempts",
);
const healthCheckMaxBackoffMs =
positiveIntOr(
props.healthCheckMaxBackoffSeconds,
DEFAULT_HEALTH_CHECK_MAX_BACKOFF_MS / 1_000,
"healthCheckMaxBackoffSeconds",
) * 1_000;

if (event.RequestType === "Delete") {
if (props.removalPolicy !== "destroy") {
console.log("Removal policy is 'retain'; leaving deployment registered in Restate.");
Expand Down Expand Up @@ -221,7 +250,7 @@ export const handler = async function (event: CloudFormationCustomResourceEvent)
healthResponse = await httpRequest(healthCheckUrl, {
method: "GET",
headers: authHeader,
timeout: 5_000,
timeout: HEALTH_CHECK_REQUEST_TIMEOUT_MS,
rejectUnauthorized,
});

Expand All @@ -235,13 +264,21 @@ export const handler = async function (event: CloudFormationCustomResourceEvent)
console.error(`Restate health check failed: "${errorMessage}" (attempt ${attempt})`);
}

if (attempt >= MAX_HEALTH_CHECK_ATTEMPTS) {
if (attempt >= maxHealthCheckAttempts) {
console.error(`Admin service health check failing after ${attempt} attempts.`);
throw new Error(errorMessage ?? `(${healthResponse?.statusCode})`);
}
attempt += 1;

const waitTimeMillis = randomInt(2_000) + 2 ** attempt * 1_000; // 3s -> 6s -> 10s -> 18s -> 34s
const waitTimeMillis = randomInt(2_000) + healthCheckBackoffBaseMs(attempt, healthCheckMaxBackoffMs);
const requiredMs = waitTimeMillis + HEALTH_CHECK_REQUEST_TIMEOUT_MS + POST_HEALTH_CHECK_RESERVE_MS;
if (context && context.getRemainingTimeInMillis() < requiredMs) {
throw new Error(
`Health check loop aborted to preserve Lambda budget for CloudFormation response: ` +
`${context.getRemainingTimeInMillis()}ms remaining, need ${requiredMs}ms. ` +
`Last error: ${errorMessage ?? `(${healthResponse?.statusCode})`}`,
);
}
console.log(`Retrying after ${waitTimeMillis} ms...`);
await sleep(waitTimeMillis);
}
Expand Down Expand Up @@ -392,6 +429,17 @@ async function sleep(millis: number) {
return new Promise((resolve) => setTimeout(resolve, millis));
}

/**
 * Resolves an optional numeric setting to a whole number of at least one.
 *
 * An absent value (`undefined`, `null`, or the empty string) yields `fallback` unchanged. Any other value is
 * converted with `Number(...)`; fractional results are rounded down.
 *
 * @param raw - The supplied value, as a number or its string form.
 * @param fallback - Returned as-is when `raw` is absent.
 * @param propertyName - Name of the setting, used only in the error message.
 * @returns `fallback`, or the floor of the numeric value of `raw`.
 * @throws Error when the converted value is not finite or is below one.
 */
function positiveIntOr(raw: number | string | undefined, fallback: number, propertyName: string): number {
  if (raw === undefined || raw === null || raw === "") return fallback;

  // Number() leaves numbers untouched and converts strings, so a single conversion covers both input kinds.
  const numeric = Number(raw);
  if (Number.isFinite(numeric) && numeric >= 1) {
    return Math.floor(numeric);
  }

  throw new Error(`Invalid value for ${propertyName}: expected a positive integer, got ${JSON.stringify(raw)}.`);
}

async function deleteDeployment(
adminUrl: string,
deploymentId: string,
Expand Down
26 changes: 26 additions & 0 deletions lib/restate-constructs/service-deployer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,25 @@ export interface ServiceRegistrationProps {
* @default false
*/
breaking?: boolean;

/**
* Maximum number of admin health check attempts before the deployer gives up and reports failure to
* CloudFormation. Defaults to a value that, combined with `healthCheckMaxBackoff`, keeps the worst-case
* loop comfortably below the deployer Lambda's 5-minute default timeout. Increase only if you also
* raise the deployer's Lambda `timeout` to match.
*
* @default 10
*/
healthCheckRetryAttempts?: number;

/**
* Cap on the per-iteration backoff sleep used during admin health check retries. Without a cap, the
* exponential backoff grows fast enough that the deployer Lambda can be killed by the runtime mid-loop,
* leaving CloudFormation to wait for its 60-minute step timeout.
*
* @default Duration.seconds(20)
*/
healthCheckMaxBackoff?: cdk.Duration;
}

/**
Expand Down Expand Up @@ -301,6 +320,13 @@ export class ServiceDeployer extends Construct {
maxPrunedPerRun: options?.maxPrunedPerRun ?? 10,
force: (options?.force ?? false).toString() as "true" | "false",
breaking: (options?.breaking ?? false).toString() as "true" | "false",
// Forward retry knobs only when the caller sets them, to avoid CFN property diffs for existing users.
...(options?.healthCheckRetryAttempts !== undefined
? { healthCheckRetryAttempts: options.healthCheckRetryAttempts }
: {}),
...(options?.healthCheckMaxBackoff !== undefined
? { healthCheckMaxBackoffSeconds: options.healthCheckMaxBackoff.toSeconds() }
: {}),
} satisfies RegistrationProperties,
});

Expand Down
2 changes: 1 addition & 1 deletion test/__snapshots__/restate-constructs.test.ts.snap
Original file line number Diff line number Diff line change
Expand Up @@ -1851,7 +1851,7 @@ exports[`Restate constructs Service Deployer overrides 1`] = `
- arm64
Code:
S3Bucket: cdk-hnb659fds-assets-account-id-region
S3Key: 8ed2b7e40aa8b43d18da885b2d5ec8673277d0810b888d6becb080a6f280a64f.zip
S3Key: 595464e51d4a001ceaa6194ce91bd75f1475b9359b2b4da2271fe26a9d65260e.zip
Description: Restate custom registration handler
Handler: entrypoint.handler
MemorySize: 128
Expand Down
48 changes: 48 additions & 0 deletions test/restate-constructs.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import * as iam from "aws-cdk-lib/aws-iam";
import * as lambda from "aws-cdk-lib/aws-lambda";
import * as route53 from "aws-cdk-lib/aws-route53";
import * as secrets from "aws-cdk-lib/aws-secretsmanager";
import { Template } from "aws-cdk-lib/assertions";
import "jest-cdk-snapshot";
import {
FargateRestateDeployment,
Expand Down Expand Up @@ -235,6 +236,53 @@ describe("Restate constructs", () => {
});
});

// Explicit retry settings passed to register() must appear verbatim on the synthesized custom resource,
// with the backoff Duration expressed in whole seconds.
test("Service Deployer health check retry overrides are forwarded to the custom resource", () => {
  const stack = new cdk.Stack(new cdk.App(), "ServiceDeployerRetryOverrides", {
    env: { account: "account-id", region: "region" },
  });

  const environment = RestateEnvironment.fromAttributes({
    adminUrl: "https://restate.example.com:9070",
  });

  const serviceHandler = mockHandler(stack);
  const deployer = new ServiceDeployer(stack, "ServiceDeployer", {
    code: lambda.Code.fromAsset("dist/register-service-handler"),
  });
  deployer.register(serviceHandler.currentVersion, environment, {
    healthCheckRetryAttempts: 7,
    healthCheckMaxBackoff: cdk.Duration.seconds(15),
  });

  // Inspect the first deployment custom resource found in the template.
  const template = Template.fromStack(stack);
  const [firstDeployment] = Object.values(template.findResources("Custom::RestateServiceDeployment"));
  const resourceProps = firstDeployment!.Properties as Record<string, unknown>;

  expect(resourceProps.healthCheckRetryAttempts).toBe(7);
  expect(resourceProps.healthCheckMaxBackoffSeconds).toBe(15);
});

// Without retry options, neither retry property may be present on the synthesized custom resource.
test("Service Deployer omits health check retry properties when not set", () => {
  const stack = new cdk.Stack(new cdk.App(), "ServiceDeployerRetryDefaults", {
    env: { account: "account-id", region: "region" },
  });

  const environment = RestateEnvironment.fromAttributes({
    adminUrl: "https://restate.example.com:9070",
  });

  const serviceHandler = mockHandler(stack);
  const deployer = new ServiceDeployer(stack, "ServiceDeployer", {
    code: lambda.Code.fromAsset("dist/register-service-handler"),
  });
  deployer.register(serviceHandler.currentVersion, environment);

  // Inspect the first deployment custom resource found in the template.
  const template = Template.fromStack(stack);
  const [firstDeployment] = Object.values(template.findResources("Custom::RestateServiceDeployment"));
  const resourceProps = firstDeployment!.Properties as Record<string, unknown>;

  // Check key presence, not value: a key explicitly set to undefined would also count as present.
  expect("healthCheckRetryAttempts" in resourceProps).toBe(false);
  expect("healthCheckMaxBackoffSeconds" in resourceProps).toBe(false);
});

test("[Experimental] Create a self-hosted Restate environment deployed on ECS Fargate", () => {
const app = new cdk.App();
const stack = new cdk.Stack(app, "RestateOnFargateStack", {
Expand Down
Loading