From 026af3e94b4fb5c21464db3d19c7b2b7b84d63a9 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Fri, 6 Mar 2026 08:17:45 +1000 Subject: [PATCH 01/27] feat(ami): add first-party nat ami assets under ami directory --- ami/README.md | 55 +++++++++++++++++++++++ ami/files/snat.service | 12 +++++ ami/files/snat.sh | 43 ++++++++++++++++++ ami/nat-zero.pkr.hcl | 88 +++++++++++++++++++++++++++++++++++++ ami/scripts/configure.sh | 14 ++++++ ami/scripts/install-deps.sh | 5 +++ 6 files changed, 217 insertions(+) create mode 100644 ami/README.md create mode 100644 ami/files/snat.service create mode 100755 ami/files/snat.sh create mode 100644 ami/nat-zero.pkr.hcl create mode 100755 ami/scripts/configure.sh create mode 100755 ami/scripts/install-deps.sh diff --git a/ami/README.md b/ami/README.md new file mode 100644 index 0000000..954f202 --- /dev/null +++ b/ami/README.md @@ -0,0 +1,55 @@ +# First-Party NAT AMI (arm64, AL2023 minimal) + +This directory contains the Packer build for the nat-zero first-party NAT AMI. + +## Supported Flavor + +- Architecture: `arm64` +- Base image: Amazon Linux 2023 minimal +- Runtime model: deterministic dual ENI (`ens5` public, `ens6` private) + +## Runtime Design Constraints + +- No IMDS calls in bootstrap/runtime NAT scripts +- No `aws` CLI calls in bootstrap/runtime NAT scripts +- No runtime ENI attach/detach or EIP association logic +- Small, readable bootstrap and NAT config scripts + +## Build + +1. Choose a public subnet ID in the target region. +2. Build with Packer: + +```bash +cd ami +packer init nat-zero.pkr.hcl +packer build \ + -var "region=us-east-1" \ + -var "subnet_id=subnet-0123456789abcdef0" \ + nat-zero.pkr.hcl +``` + +The AMI name format is: + +- `nat-zero-al2023-minimal-arm64-` + +This full AMI name is used as the module default target for deterministic first-party rollout. 
+ +## GitHub Workflow + +Workflow: `.github/workflows/nat-images.yml` + +- Requires GitHub environment secret `AMI_BUILD_ROLE_ARN` +- Requires GitHub environment secret `INTEGRATION_ROLE_ARN` when `run_integration_gate=true` +- Uses OIDC via `aws-actions/configure-aws-credentials` +- Inputs: + - `build_subnet_id` + - `source_region` (default `us-east-1`) + - `run_integration_gate` (default `true`) +- Behavior: + - builds a new first-party AMI with Packer + - copies it to all currently enabled regions in the account (parallel copy with retries) + - runs integration tests against the new source AMI (gate) before promotion + - updates `first_party_ami_name_pattern` (and generated docs) and opens a PR + +Merge the promotion PR to `main` to let release-please publish a new module release that points to the promoted AMI name. diff --git a/ami/files/snat.service b/ami/files/snat.service new file mode 100644 index 0000000..3754e39 --- /dev/null +++ b/ami/files/snat.service @@ -0,0 +1,12 @@ +[Unit] +Description=Configure deterministic IPv4 SNAT for NAT instance +Wants=network-online.target +After=network-online.target + +[Service] +ExecStart=/opt/nat/snat.sh +Type=oneshot +RemainAfterExit=yes + +[Install] +WantedBy=multi-user.target diff --git a/ami/files/snat.sh b/ami/files/snat.sh new file mode 100755 index 0000000..9d1b71a --- /dev/null +++ b/ami/files/snat.sh @@ -0,0 +1,43 @@ +#!/bin/sh +set -eu + +NAT_PUBLIC_IFACE="${NAT_PUBLIC_IFACE:-ens5}" +NAT_PRIVATE_IFACE="${NAT_PRIVATE_IFACE:-ens6}" + +if ! ip link show "$NAT_PUBLIC_IFACE" > /dev/null 2>&1; then + echo "Missing expected public interface: $NAT_PUBLIC_IFACE" >&2 + exit 1 +fi + +if ! 
ip link show "$NAT_PRIVATE_IFACE" > /dev/null 2>&1; then + echo "Missing expected private interface: $NAT_PRIVATE_IFACE" >&2 + exit 1 +fi + +cat > /etc/sysctl.d/99-nat.conf << 'EOF_SYSCTL' +net.ipv4.ip_forward = 1 +EOF_SYSCTL +sysctl --system > /dev/null + +cat > /etc/sysconfig/iptables << EOF_IPTABLES +*filter +:INPUT DROP [0:0] +:FORWARD DROP [0:0] +:OUTPUT ACCEPT [0:0] +-A INPUT -i lo -j ACCEPT +-A INPUT -i $NAT_PRIVATE_IFACE -j ACCEPT +-A INPUT -i $NAT_PUBLIC_IFACE -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT +-A FORWARD -i $NAT_PRIVATE_IFACE -o $NAT_PUBLIC_IFACE -j ACCEPT +-A FORWARD -i $NAT_PUBLIC_IFACE -o $NAT_PRIVATE_IFACE -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT +COMMIT + +*nat +:PREROUTING ACCEPT [0:0] +:INPUT ACCEPT [0:0] +:OUTPUT ACCEPT [0:0] +:POSTROUTING ACCEPT [0:0] +-A POSTROUTING -o $NAT_PUBLIC_IFACE -j MASQUERADE +COMMIT +EOF_IPTABLES + +iptables-restore < /etc/sysconfig/iptables diff --git a/ami/nat-zero.pkr.hcl b/ami/nat-zero.pkr.hcl new file mode 100644 index 0000000..b81141e --- /dev/null +++ b/ami/nat-zero.pkr.hcl @@ -0,0 +1,88 @@ +packer { + required_plugins { + amazon = { + source = "github.com/hashicorp/amazon" + version = ">= 1.2.0" + } + } +} + +variable "region" { + type = string + default = "us-east-1" +} + +variable "subnet_id" { + type = string +} + +variable "ami_name_prefix" { + type = string + default = "nat-zero-al2023-minimal-arm64" +} + +variable "root_volume_size" { + type = number + default = 4 +} + +source "amazon-ebs" "nat_zero" { + ami_name = "${var.ami_name_prefix}-${formatdate("YYYYMMDD-hhmmss", timestamp())}" + instance_type = "t4g.nano" + region = var.region + max_retries = 50 + subnet_id = var.subnet_id + ssh_username = "ec2-user" + + launch_block_device_mappings { + device_name = "/dev/xvda" + volume_size = var.root_volume_size + volume_type = "gp3" + delete_on_termination = true + encrypted = true + } + + source_ami_filter { + filters = { + name = "al2023-ami-minimal-*-kernel-*-arm64" + 
root-device-type = "ebs" + virtualization-type = "hvm" + } + most_recent = true + owners = ["amazon"] + } + + tags = { + Name = "nat-zero-first-party" + Project = "nat-zero" + Role = "nat" + ManagedBy = "packer" + OS = "al2023-minimal" + Architecture = "arm64" + } +} + +build { + name = "nat-zero-first-party" + sources = ["source.amazon-ebs.nat_zero"] + + provisioner "file" { + source = "files/snat.sh" + destination = "/tmp/snat.sh" + } + + provisioner "file" { + source = "files/snat.service" + destination = "/tmp/snat.service" + } + + provisioner "shell" { + execute_command = "sudo -E sh -eux '{{ .Path }}'" + script = "scripts/install-deps.sh" + } + + provisioner "shell" { + execute_command = "sudo -E sh -eux '{{ .Path }}'" + script = "scripts/configure.sh" + } +} diff --git a/ami/scripts/configure.sh b/ami/scripts/configure.sh new file mode 100755 index 0000000..990e96c --- /dev/null +++ b/ami/scripts/configure.sh @@ -0,0 +1,14 @@ +#!/bin/sh +set -eu + +systemctl stop sshd +systemctl disable sshd +systemctl mask sshd +dnf remove -y openssh-server + +mkdir -p /opt/nat +install /tmp/snat.sh /opt/nat/snat.sh -m u+rx +cp /tmp/snat.service /etc/systemd/system/snat.service + +systemctl daemon-reload +systemctl enable snat diff --git a/ami/scripts/install-deps.sh b/ami/scripts/install-deps.sh new file mode 100755 index 0000000..e7650a5 --- /dev/null +++ b/ami/scripts/install-deps.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -eu + +dnf -y update +dnf -y install iptables From 79b44821be929b8c36fda9acf282db90b399f821 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Fri, 6 Mar 2026 10:22:40 +1000 Subject: [PATCH 02/27] feat: build and promote nat-zero AMIs --- .github/workflows/integration-tests.yml | 21 ++ .github/workflows/nat-images.yml | 339 ++++++++++++++++++++++++ .github/workflows/precommit.yml | 5 + .gitignore | 1 + .pre-commit-config.yaml | 22 ++ README.md | 19 +- ami.tf | 23 ++ ami/README.md | 13 +- ami/nat-zero.pkr.hcl | 11 +- ami/scripts/install-deps.sh | 6 +- 
cmd/lambda/ec2iface.go | 3 - cmd/lambda/ec2ops.go | 44 +-- cmd/lambda/ec2ops_test.go | 33 +-- cmd/lambda/handler.go | 2 - cmd/lambda/handler_test.go | 26 +- cmd/lambda/main.go | 2 - cmd/lambda/mock_test.go | 51 +--- docs/examples.md | 11 +- docs/index.md | 6 +- docs/performance.md | 2 +- docs/reference.md | 12 +- docs/testing.md | 2 +- docs/workflows.md | 24 +- examples/basic/main.tf | 2 +- iam.tf | 3 - lambda.tf | 16 +- launch_template.tf | 9 +- scripts/update_ami_defaults.sh | 50 ++++ tests/integration/fixture/main.tf | 6 + tests/integration/nat_zero_test.go | 193 ++++++++++---- variables.tf | 15 +- 31 files changed, 720 insertions(+), 252 deletions(-) create mode 100644 .github/workflows/nat-images.yml create mode 100644 ami.tf create mode 100644 scripts/update_ami_defaults.sh diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 9b9a231..bd28558 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -4,6 +4,23 @@ on: pull_request: types: [labeled] workflow_dispatch: + inputs: + nat_ami_id: + description: Explicit NAT AMI ID to use for the integration fixture + required: false + type: string + updated_nat_ami_id: + description: Optional replacement NAT AMI ID to exercise the AMI upgrade path + required: false + type: string + workflow_call: + inputs: + nat_ami_id: + required: false + type: string + updated_nat_ami_id: + required: false + type: string concurrency: group: nat-zero-integration @@ -16,11 +33,15 @@ permissions: jobs: integration-test: if: >- + github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' || github.event.label.name == 'integration-test' runs-on: ubuntu-latest timeout-minutes: 15 environment: integration + env: + NAT_ZERO_TEST_NAT_AMI_ID: ${{ inputs.nat_ami_id || github.event.inputs.nat_ami_id || '' }} + NAT_ZERO_TEST_UPDATED_NAT_AMI_ID: ${{ inputs.updated_nat_ami_id || github.event.inputs.updated_nat_ami_id || '' }} steps: 
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml new file mode 100644 index 0000000..582f843 --- /dev/null +++ b/.github/workflows/nat-images.yml @@ -0,0 +1,339 @@ +name: NAT Images + +on: + workflow_dispatch: + inputs: + build_subnet_id: + description: Public subnet ID to use for the Packer builder instance + required: true + type: string + source_region: + description: Region where the AMI is built before being copied globally + required: false + default: us-east-1 + type: string + run_integration_gate: + description: Run the us-east-1 integration gates before publishing and promoting the AMI + required: false + default: true + type: boolean + +concurrency: + group: nat-zero-ami + cancel-in-progress: false + +permissions: + id-token: write + contents: write + pull-requests: write + +jobs: + build-and-copy: + runs-on: ubuntu-latest + environment: ami-build + outputs: + ami_name: ${{ steps.metadata.outputs.ami_name }} + owner_account_id: ${{ steps.metadata.outputs.owner_account_id }} + source_ami_id: ${{ steps.build.outputs.source_ami_id }} + test_ami_id: ${{ steps.test-ami.outputs.test_ami_id }} + regions_json: ${{ steps.regions.outputs.regions_json }} + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - uses: hashicorp/setup-packer@v3 + + - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 + with: + role-to-assume: ${{ secrets.AMI_BUILD_ROLE_ARN }} + aws-region: ${{ inputs.source_region }} + + - name: Build AMI + id: build + working-directory: ami + run: | + rm -f manifest.json + packer init nat-zero.pkr.hcl + packer build \ + -color=false \ + -var "region=${{ inputs.source_region }}" \ + -var "subnet_id=${{ inputs.build_subnet_id }}" \ + nat-zero.pkr.hcl + + source_ami_id="$(jq -r '.builds[-1].artifact_id' manifest.json | awk -F: '{print $2}')" + if [ -z "$source_ami_id" ] || [ 
"$source_ami_id" = "null" ]; then + echo "failed to determine source AMI ID from ami/manifest.json" >&2 + exit 1 + fi + + echo "source_ami_id=$source_ami_id" >> "$GITHUB_OUTPUT" + + - name: Resolve AMI metadata + id: metadata + env: + SOURCE_AMI_ID: ${{ steps.build.outputs.source_ami_id }} + run: | + owner_account_id="$(aws sts get-caller-identity --query 'Account' --output text)" + ami_name="$(aws ec2 describe-images --region "${{ inputs.source_region }}" --image-ids "$SOURCE_AMI_ID" --query 'Images[0].Name' --output text)" + + echo "owner_account_id=$owner_account_id" >> "$GITHUB_OUTPUT" + echo "ami_name=$ami_name" >> "$GITHUB_OUTPUT" + + - name: Resolve enabled regions + id: regions + run: | + regions_json="$(aws ec2 describe-regions --all-regions --query "Regions[?OptInStatus=='opt-in-not-required' || OptInStatus=='opted-in'].RegionName" --output json | jq -c 'sort')" + echo "regions_json=$regions_json" >> "$GITHUB_OUTPUT" + + - name: Copy AMI to enabled regions + env: + AMI_NAME: ${{ steps.metadata.outputs.ami_name }} + REGIONS_JSON: ${{ steps.regions.outputs.regions_json }} + SOURCE_AMI_ID: ${{ steps.build.outputs.source_ami_id }} + SOURCE_REGION: ${{ inputs.source_region }} + run: | + copy_region() { + local region="$1" + local attempt + local err_file + local image_id + + err_file="$(mktemp)" + trap 'rm -f "$err_file"' RETURN + + for attempt in 1 2 3 4 5; do + image_id="$(aws ec2 copy-image \ + --region "$region" \ + --source-region "$SOURCE_REGION" \ + --source-image-id "$SOURCE_AMI_ID" \ + --name "$AMI_NAME" \ + --description "nat-zero AMI copied from $SOURCE_REGION" \ + --query 'ImageId' \ + --output text 2>"$err_file")" && break + sleep $((attempt * 5)) + done + + if [ -z "${image_id:-}" ] || [ "$image_id" = "None" ]; then + cat "$err_file" >&2 || true + return 1 + fi + + aws ec2 wait image-available --region "$region" --image-ids "$image_id" + } + + while IFS= read -r region; do + if [ "$region" = "$SOURCE_REGION" ]; then + continue + fi + + ( + set 
-euo pipefail + copy_region "$region" + ) & + done < <(jq -r '.[]' <<<"$REGIONS_JSON") + + wait + + - name: Resolve us-east-1 test AMI + id: test-ami + env: + AMI_NAME: ${{ steps.metadata.outputs.ami_name }} + OWNER_ACCOUNT_ID: ${{ steps.metadata.outputs.owner_account_id }} + SOURCE_AMI_ID: ${{ steps.build.outputs.source_ami_id }} + run: | + if [ "${{ inputs.source_region }}" = "us-east-1" ]; then + test_ami_id="$SOURCE_AMI_ID" + else + test_ami_id="$(aws ec2 describe-images \ + --region us-east-1 \ + --owners "$OWNER_ACCOUNT_ID" \ + --filters "Name=name,Values=$AMI_NAME" "Name=state,Values=available" \ + --query 'Images[0].ImageId' \ + --output text)" + fi + + if [ -z "$test_ami_id" ] || [ "$test_ami_id" = "None" ]; then + echo "failed to resolve the us-east-1 AMI copy for $AMI_NAME" >&2 + exit 1 + fi + + echo "test_ami_id=$test_ami_id" >> "$GITHUB_OUTPUT" + + integration-new-ami: + if: ${{ inputs.run_integration_gate }} + needs: build-and-copy + runs-on: ubuntu-latest + timeout-minutes: 15 + environment: integration + env: + NAT_ZERO_TEST_NAT_AMI_ID: ${{ needs.build-and-copy.outputs.test_ami_id }} + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 + with: + go-version-file: tests/integration/go.mod + + - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 + with: + terraform_wrapper: false + + - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 + with: + role-to-assume: ${{ secrets.INTEGRATION_ROLE_ARN }} + aws-region: us-east-1 + + - name: Build Lambda binary + working-directory: cmd/lambda + run: | + GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -tags lambda.norpc -ldflags='-s -w' -o bootstrap + zip lambda.zip bootstrap + mkdir -p ../../.build + cp lambda.zip ../../.build/lambda.zip + + - name: Test new AMI directly + working-directory: tests/integration + run: go test -v -timeout 10m 
-count=1 + + integration-ami-upgrade: + if: ${{ inputs.run_integration_gate }} + needs: + - build-and-copy + - integration-new-ami + runs-on: ubuntu-latest + timeout-minutes: 15 + environment: integration + env: + NAT_ZERO_TEST_UPDATED_NAT_AMI_ID: ${{ needs.build-and-copy.outputs.test_ami_id }} + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 + with: + go-version-file: tests/integration/go.mod + + - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 + with: + terraform_wrapper: false + + - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 + with: + role-to-assume: ${{ secrets.INTEGRATION_ROLE_ARN }} + aws-region: us-east-1 + + - name: Build Lambda binary + working-directory: cmd/lambda + run: | + GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -tags lambda.norpc -ldflags='-s -w' -o bootstrap + zip lambda.zip bootstrap + mkdir -p ../../.build + cp lambda.zip ../../.build/lambda.zip + + - name: Test AMI upgrade path + working-directory: tests/integration + run: go test -v -timeout 15m -count=1 + + publish-public: + needs: + - build-and-copy + - integration-new-ami + - integration-ami-upgrade + if: >- + always() && + needs.build-and-copy.result == 'success' && + ( + inputs.run_integration_gate == false || + (needs.integration-new-ami.result == 'success' && needs.integration-ami-upgrade.result == 'success') + ) + runs-on: ubuntu-latest + environment: ami-build + steps: + - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 + with: + role-to-assume: ${{ secrets.AMI_BUILD_ROLE_ARN }} + aws-region: ${{ inputs.source_region }} + + - name: Make copied AMIs public + env: + AMI_NAME: ${{ needs.build-and-copy.outputs.ami_name }} + OWNER_ACCOUNT_ID: ${{ needs.build-and-copy.outputs.owner_account_id }} + REGIONS_JSON: ${{ needs.build-and-copy.outputs.regions_json }} + 
run: | + while IFS= read -r region; do + image_id="$(aws ec2 describe-images \ + --region "$region" \ + --owners "$OWNER_ACCOUNT_ID" \ + --filters "Name=name,Values=$AMI_NAME" "Name=state,Values=available" \ + --query 'Images[0].ImageId' \ + --output text)" + + if [ -z "$image_id" ] || [ "$image_id" = "None" ]; then + echo "failed to resolve image for $region" >&2 + exit 1 + fi + + aws ec2 modify-image-attribute \ + --region "$region" \ + --image-id "$image_id" \ + --launch-permission 'Add=[{Group=all}]' + done < <(jq -r '.[]' <<<"$REGIONS_JSON") + + open-promotion-pr: + needs: + - build-and-copy + - publish-public + if: ${{ needs.publish-public.result == 'success' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + ref: ${{ github.ref_name }} + + - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 + with: + terraform_wrapper: false + + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: "3.12" + + - name: Install pre-commit + run: python -m pip install --upgrade pre-commit + + - name: Update promoted AMI defaults + env: + AMI_NAME: ${{ needs.build-and-copy.outputs.ami_name }} + OWNER_ACCOUNT_ID: ${{ needs.build-and-copy.outputs.owner_account_id }} + run: | + bash scripts/update_ami_defaults.sh "$OWNER_ACCOUNT_ID" "$AMI_NAME" + terraform fmt -recursive + pre-commit run terraform-docs-go --all-files + + - name: Create or update promotion PR + env: + AMI_NAME: ${{ needs.build-and-copy.outputs.ami_name }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + branch_name="automation/promote-nat-zero-ami-$(tr -cs '[:alnum:]' '-' <<<"$AMI_NAME" | sed 's/^-//; s/-$//' | tr '[:upper:]' '[:lower:]')" + commit_title="feat: promote nat-zero AMI ${AMI_NAME}" + pr_title="$commit_title" + pr_body=$'Promotes the default nat-zero AMI after the automated Packer build, global copy, and us-east-1 integration gates passed.\n\nSquash-merge this 
PR to preserve the `feat:` title so release-please cuts the next module release PR.' + + if git diff --quiet; then + echo "no default changes detected; nothing to promote" + exit 0 + fi + + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + git checkout -B "$branch_name" + git add variables.tf README.md docs/reference.md + git commit -m "$commit_title" + git push --force --set-upstream origin "$branch_name" + + pr_number="$(gh pr list --head "$branch_name" --json number --jq '.[0].number')" + if [ -n "$pr_number" ]; then + gh pr edit "$pr_number" --title "$pr_title" --body "$pr_body" + else + gh pr create --base main --head "$branch_name" --title "$pr_title" --body "$pr_body" + fi diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index c6c4296..86df725 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -16,11 +16,16 @@ jobs: with: go-version-file: cmd/lambda/go.mod + - uses: hashicorp/setup-packer@v3 + - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 - name: Install tools run: | + sudo apt-get update + sudo apt-get install -y shellcheck go install honnef.co/go/tools/cmd/staticcheck@latest + go install github.com/rhysd/actionlint/cmd/actionlint@latest curl -s https://raw.githubusercontent.com/terraform-linters/tflint/master/install_linux.sh | bash - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 diff --git a/.gitignore b/.gitignore index 9476cc0..d70df16 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ cmd/lambda/lambda cmd/lambda/bootstrap *.zip +ami/manifest.json # Go vendor/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5ebf7f0..5826891 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,6 +11,10 @@ repos: - id: detect-private-key - id: detect-aws-credentials args: ["--allow-missing-credentials"] + - repo: 
https://github.com/rhysd/actionlint + rev: v1.7.8 + hooks: + - id: actionlint - repo: https://github.com/TekWizely/pre-commit-golang rev: v1.0.0-rc.1 hooks: @@ -30,6 +34,23 @@ repos: files: '\.go$' exclude: "tests/integration/" pass_filenames: false + - id: shellcheck + name: shellcheck + language: system + entry: shellcheck + files: '(^scripts/.*\.sh$|^ami/.*\.sh$)' + - id: packer-fmt + name: packer fmt + language: system + entry: bash -c 'cd ami && packer fmt -check -diff nat-zero.pkr.hcl' + files: '^ami/.*\.(pkr\.hcl|hcl)$' + pass_filenames: false + - id: packer-validate + name: packer validate + language: system + entry: bash -c 'cd ami && packer init nat-zero.pkr.hcl >/dev/null && packer validate -var "subnet_id=subnet-00000000000000000" nat-zero.pkr.hcl' + files: '^ami/.*$' + pass_filenames: false - repo: https://github.com/zricethezav/gitleaks rev: v8.16.4 hooks: @@ -38,6 +59,7 @@ repos: rev: v1.77.0 hooks: - id: terraform_fmt + - id: terraform_validate - id: terraform_tflint - repo: https://github.com/terraform-docs/terraform-docs rev: "v0.16.0" diff --git a/README.md b/README.md index 997dd83..07ea58d 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ nat-zero is a Terraform module that replaces always-on NAT with on-demand NAT instances. When a workload launches in a private subnet, a NAT instance starts automatically. When the last workload stops, the NAT shuts down and its Elastic IP is released. Idle cost: ~$0.80/month per AZ. -Built on [fck-nat](https://fck-nat.dev/) AMIs. Orchestrated by a single Go Lambda (~55 ms cold start, 29 MB memory). Integration-tested against real AWS infrastructure on every PR. +Built around a NAT Zero AMI baked in-repo and promoted through a dedicated workflow. Orchestrated by a single Go Lambda (~55 ms cold start, 29 MB memory). Integration-tested against real AWS infrastructure on every PR. ``` AZ-A (active) AZ-B (idle) @@ -33,9 +33,9 @@ Built on [fck-nat](https://fck-nat.dev/) AMIs. 
Orchestrated by a single Go Lambd | **Idle** (no workloads) | **~$0.80/mo** | ~$7-8 | ~$36+ | | **Active** (workloads running) | ~$7-8 | ~$7-8 | ~$36+ | -AWS NAT Gateway costs ~$36/month per AZ even when idle. fck-nat brings that to ~$7-8/month, but the instance and EIP run 24/7. nat-zero releases the Elastic IP when idle, avoiding the [$3.60/month public IPv4 charge](https://aws.amazon.com/blogs/aws/new-aws-public-ipv4-address-charge-public-ip-insights/). +AWS NAT Gateway costs ~$36/month per AZ even when idle. `fck-nat` brings that to ~$7-8/month, but the instance and EIP stay allocated 24/7. nat-zero releases the Elastic IP when idle, avoiding the [$3.60/month public IPv4 charge](https://aws.amazon.com/blogs/aws/new-aws-public-ipv4-address-charge-public-ip-insights/). -Best for dev/staging environments, CI/CD runners, batch jobs, and side projects where workloads run intermittently. +Best for dev/staging environments, CI/CD runners, batch jobs, and side projects where workloads run intermittently. If you need a simpler always-on NAT instance, `fck-nat` is still a sensible option. ## How it works @@ -99,6 +99,7 @@ See [Performance](docs/performance.md) for detailed timings and cost breakdowns. - **EventBridge scope**: Captures all EC2 state changes in the account; Lambda filters by VPC ID. - **Startup delay**: First workload in an idle AZ waits ~10 seconds for internet. Design scripts to retry outbound connections. - **Dual ENI**: Persistent public + private ENIs survive stop/start cycles. +- **AMI compatibility**: The module defaults to the NAT Zero AMI track. Custom AMIs are supported only if they follow the same deterministic dual-ENI model. `fck-nat` AMIs are intentionally unsupported because their bootstrap interrogates IMDS/AWS to discover attached ENIs before nat-zero's EIP lifecycle has completed. - **Retries**: Failed Lambda invocations are retried up to 2 times by EventBridge. 
- **Clean destroy**: A cleanup action terminates NAT instances before `terraform destroy` removes ENIs. - **Config versioning**: Changing AMI or instance type auto-replaces NAT instances on next workload event. @@ -118,9 +119,9 @@ See [Performance](docs/performance.md) for detailed timings and cost breakdowns. | Name | Version | |------|---------| -| [aws](#provider\_aws) | >= 5.0 | -| [null](#provider\_null) | >= 3.0 | -| [time](#provider\_time) | >= 0.9 | +| [aws](#provider\_aws) | 6.34.0 | +| [null](#provider\_null) | 3.2.4 | +| [time](#provider\_time) | 0.13.1 | ## Modules @@ -151,17 +152,18 @@ No modules. | [null_resource.download_lambda](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [time_sleep.eventbridge_propagation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [time_sleep.lambda_ready](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | +| [aws_ami.nat](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [ami\_id](#input\_ami\_id) | Explicit AMI ID to use (overrides AMI lookup entirely) | `string` | `null` | no | +| [ami\_name\_pattern](#input\_ami\_name\_pattern) | AMI name pattern used when resolving the default nat-zero AMI. Override this to use your own shared AMI. | `string` | `null` | no | +| [ami\_owner\_account](#input\_ami\_owner\_account) | Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI. 
| `string` | `null` | no | | [availability\_zones](#input\_availability\_zones) | List of availability zones to deploy NAT instances in | `list(string)` | n/a | yes | | [block\_device\_size](#input\_block\_device\_size) | Size in GB of the root EBS volume | `number` | `10` | no | | [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source instead of downloading a pre-compiled release. Requires Go and zip installed locally. | `bool` | `false` | no | -| [custom\_ami\_name\_pattern](#input\_custom\_ami\_name\_pattern) | AMI name pattern when use\_fck\_nat\_ami is false | `string` | `null` | no | -| [custom\_ami\_owner](#input\_custom\_ami\_owner) | AMI owner account ID when use\_fck\_nat\_ami is false | `string` | `null` | no | | [enable\_logging](#input\_enable\_logging) | Create a CloudWatch log group for the Lambda function | `bool` | `true` | no | | [encrypt\_root\_volume](#input\_encrypt\_root\_volume) | Encrypt the root EBS volume. | `bool` | `true` | no | | [ignore\_tag\_key](#input\_ignore\_tag\_key) | Tag key used to mark instances the Lambda should ignore | `string` | `"nat-zero:ignore"` | no | @@ -179,7 +181,6 @@ No modules. | [private\_subnets\_cidr\_blocks](#input\_private\_subnets\_cidr\_blocks) | CIDR blocks for the private subnets (one per AZ, used in security group rules) | `list(string)` | n/a | yes | | [public\_subnets](#input\_public\_subnets) | Public subnet IDs (one per AZ) for NAT instance public ENIs | `list(string)` | n/a | yes | | [tags](#input\_tags) | Additional tags to apply to all resources | `map(string)` | `{}` | no | -| [use\_fck\_nat\_ami](#input\_use\_fck\_nat\_ami) | Use the public fck-nat AMI. Set to false to use a custom AMI. 
| `bool` | `true` | no | | [vpc\_id](#input\_vpc\_id) | The VPC ID where NAT instances will be deployed | `string` | n/a | yes | ## Outputs diff --git a/ami.tf b/ami.tf new file mode 100644 index 0000000..a999be2 --- /dev/null +++ b/ami.tf @@ -0,0 +1,23 @@ +locals { + ami_lookup_enabled = var.ami_id == null && var.ami_owner_account != null && var.ami_name_pattern != null +} + +data "aws_ami" "nat" { + count = local.ami_lookup_enabled ? 1 : 0 + most_recent = true + owners = [local.ami_lookup_enabled ? var.ami_owner_account : "000000000000"] + + filter { + name = "name" + values = [local.ami_lookup_enabled ? var.ami_name_pattern : "missing"] + } + + filter { + name = "state" + values = ["available"] + } +} + +locals { + effective_ami_id = var.ami_id != null ? var.ami_id : try(data.aws_ami.nat[0].id, null) +} diff --git a/ami/README.md b/ami/README.md index 954f202..1f55eea 100644 --- a/ami/README.md +++ b/ami/README.md @@ -1,6 +1,6 @@ -# First-Party NAT AMI (arm64, AL2023 minimal) +# NAT Zero AMI (arm64, AL2023 minimal) -This directory contains the Packer build for the nat-zero first-party NAT AMI. +This directory contains the Packer build for the nat-zero AMI. ## Supported Flavor @@ -14,6 +14,9 @@ This directory contains the Packer build for the nat-zero first-party NAT AMI. 
- No `aws` CLI calls in bootstrap/runtime NAT scripts - No runtime ENI attach/detach or EIP association logic - Small, readable bootstrap and NAT config scripts +- `fck-nat`-style bootstrap discovery is intentionally avoided because nat-zero relies on launch-template-owned ENIs and attaches the EIP later in the reconciliation loop +- Unencrypted AMI backing snapshot so the image can be made public; the module can still encrypt runtime NAT instance volumes +- Build-time OS patching via `dnf upgrade --refresh` before the AMI is created ## Build @@ -33,7 +36,7 @@ The AMI name format is: - `nat-zero-al2023-minimal-arm64-` -This full AMI name is used as the module default target for deterministic first-party rollout. +This full AMI name is used as the module default target for deterministic rollout. ## GitHub Workflow @@ -47,9 +50,9 @@ Workflow: `.github/workflows/nat-images.yml` - `source_region` (default `us-east-1`) - `run_integration_gate` (default `true`) - Behavior: - - builds a new first-party AMI with Packer + - builds a new nat-zero AMI with Packer - copies it to all currently enabled regions in the account (parallel copy with retries) - runs integration tests against the new source AMI (gate) before promotion - - updates `first_party_ami_name_pattern` (and generated docs) and opens a PR + - updates `ami_owner_account`, `ami_name_pattern` (and generated docs) and opens a PR Merge the promotion PR to `main` to let release-please publish a new module release that points to the promoted AMI name. diff --git a/ami/nat-zero.pkr.hcl b/ami/nat-zero.pkr.hcl index b81141e..4a3dc66 100644 --- a/ami/nat-zero.pkr.hcl +++ b/ami/nat-zero.pkr.hcl @@ -39,7 +39,8 @@ source "amazon-ebs" "nat_zero" { volume_size = var.root_volume_size volume_type = "gp3" delete_on_termination = true - encrypted = true + # Public AMIs cannot reference encrypted backing snapshots. 
+ encrypted = false } source_ami_filter { @@ -53,7 +54,7 @@ source "amazon-ebs" "nat_zero" { } tags = { - Name = "nat-zero-first-party" + Name = "nat-zero-ami-build" Project = "nat-zero" Role = "nat" ManagedBy = "packer" @@ -63,7 +64,7 @@ source "amazon-ebs" "nat_zero" { } build { - name = "nat-zero-first-party" + name = "nat-zero-ami" sources = ["source.amazon-ebs.nat_zero"] provisioner "file" { @@ -85,4 +86,8 @@ build { execute_command = "sudo -E sh -eux '{{ .Path }}'" script = "scripts/configure.sh" } + + post-processor "manifest" { + output = "manifest.json" + } } diff --git a/ami/scripts/install-deps.sh b/ami/scripts/install-deps.sh index e7650a5..a722e80 100755 --- a/ami/scripts/install-deps.sh +++ b/ami/scripts/install-deps.sh @@ -1,5 +1,9 @@ #!/bin/sh set -eu -dnf -y update +# Always bake from a fully patched AL2023 base so each AMI includes the +# latest published OS-level fixes available at build time. +dnf -y upgrade --refresh dnf -y install iptables +dnf clean all +rm -rf /var/cache/dnf diff --git a/cmd/lambda/ec2iface.go b/cmd/lambda/ec2iface.go index 0db74c5..43b0ca6 100644 --- a/cmd/lambda/ec2iface.go +++ b/cmd/lambda/ec2iface.go @@ -18,8 +18,5 @@ type EC2API interface { DisassociateAddress(ctx context.Context, params *ec2.DisassociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateAddressOutput, error) ReleaseAddress(ctx context.Context, params *ec2.ReleaseAddressInput, optFns ...func(*ec2.Options)) (*ec2.ReleaseAddressOutput, error) DescribeAddresses(ctx context.Context, params *ec2.DescribeAddressesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeAddressesOutput, error) - DescribeNetworkInterfaces(ctx context.Context, params *ec2.DescribeNetworkInterfacesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) - DescribeImages(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) DescribeLaunchTemplates(ctx context.Context, params 
*ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) - DescribeLaunchTemplateVersions(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) } diff --git a/cmd/lambda/ec2ops.go b/cmd/lambda/ec2ops.go index bc88a46..dadff5d 100644 --- a/cmd/lambda/ec2ops.go +++ b/cmd/lambda/ec2ops.go @@ -5,7 +5,6 @@ import ( "errors" "fmt" "log" - "sort" "strings" "time" @@ -321,33 +320,6 @@ func (h *Handler) isCurrentConfig(inst *Instance) bool { // --- NAT lifecycle helpers --- -func (h *Handler) resolveAMI(ctx context.Context) string { - defer timed("resolve_ami")() - resp, err := h.EC2.DescribeImages(ctx, &ec2.DescribeImagesInput{ - Owners: []string{h.AMIOwner}, - Filters: []ec2types.Filter{ - {Name: aws.String("name"), Values: []string{h.AMIPattern}}, - {Name: aws.String("state"), Values: []string{"available"}}, - }, - }) - if err != nil { - log.Printf("AMI lookup failed, using launch template default: %v", err) - return "" - } - if len(resp.Images) == 0 { - return "" - } - - images := resp.Images - sort.Slice(images, func(i, j int) bool { - return aws.ToString(images[i].CreationDate) > aws.ToString(images[j].CreationDate) - }) - ami := images[0] - amiID := aws.ToString(ami.ImageId) - log.Printf("Using AMI %s (%s)", amiID, aws.ToString(ami.Name)) - return amiID -} - func (h *Handler) resolveLT(ctx context.Context, az, vpc string) (string, int64) { defer timed("resolve_lt")() resp, err := h.EC2.DescribeLaunchTemplates(ctx, &ec2.DescribeLaunchTemplatesInput{ @@ -361,16 +333,10 @@ func (h *Handler) resolveLT(ctx context.Context, az, vpc string) (string, int64) } ltID := aws.ToString(resp.LaunchTemplates[0].LaunchTemplateId) - - verResp, err := h.EC2.DescribeLaunchTemplateVersions(ctx, &ec2.DescribeLaunchTemplateVersionsInput{ - LaunchTemplateId: aws.String(ltID), - Versions: []string{"$Latest"}, - }) - if err != nil || 
len(verResp.LaunchTemplateVersions) == 0 { + version := aws.ToInt64(resp.LaunchTemplates[0].LatestVersionNumber) + if version == 0 { return "", 0 } - - version := aws.ToInt64(verResp.LaunchTemplateVersions[0].VersionNumber) return ltID, version } @@ -383,8 +349,6 @@ func (h *Handler) createNAT(ctx context.Context, az, vpc string) string { return "" } - amiID := h.resolveAMI(ctx) - input := &ec2.RunInstancesInput{ LaunchTemplate: &ec2types.LaunchTemplateSpecification{ LaunchTemplateId: aws.String(ltID), @@ -403,10 +367,6 @@ func (h *Handler) createNAT(ctx context.Context, az, vpc string) string { }} } - if amiID != "" { - input.ImageId = aws.String(amiID) - } - resp, err := h.EC2.RunInstances(ctx, input) if err != nil { log.Printf("Failed to create NAT instance: %v", err) diff --git a/cmd/lambda/ec2ops_test.go b/cmd/lambda/ec2ops_test.go index c20da4b..bc1fc0f 100644 --- a/cmd/lambda/ec2ops_test.go +++ b/cmd/lambda/ec2ops_test.go @@ -369,16 +369,12 @@ func TestReleaseEIPs(t *testing.T) { // --- createNAT() --- func TestCreateNAT(t *testing.T) { - setupLTAndAMI := func(mock *mockEC2) { + setupLT := func(mock *mockEC2) { mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { return &ec2.DescribeLaunchTemplatesOutput{ - LaunchTemplates: []ec2types.LaunchTemplate{{LaunchTemplateId: aws.String("lt-123")}}, - }, nil - } - mock.DescribeLaunchTemplateVersionsFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) { - return &ec2.DescribeLaunchTemplateVersionsOutput{ - LaunchTemplateVersions: []ec2types.LaunchTemplateVersion{{ - LaunchTemplateId: aws.String("lt-123"), VersionNumber: aws.Int64(1), + LaunchTemplates: []ec2types.LaunchTemplate{{ + LaunchTemplateId: aws.String("lt-123"), + LatestVersionNumber: aws.Int64(1), }}, }, nil } @@ -386,16 +382,7 
@@ func TestCreateNAT(t *testing.T) { t.Run("happy path", func(t *testing.T) { mock := &mockEC2{} - setupLTAndAMI(mock) - mock.DescribeImagesFn = func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - return &ec2.DescribeImagesOutput{ - Images: []ec2types.Image{{ - ImageId: aws.String("ami-fcknat"), - Name: aws.String("fck-nat-al2023-1.0-arm64-20240101"), - CreationDate: aws.String("2024-01-01T00:00:00.000Z"), - }}, - }, nil - } + setupLT(mock) mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { return &ec2.RunInstancesOutput{ Instances: []ec2types.Instance{{InstanceId: aws.String("i-new1")}}, @@ -422,10 +409,7 @@ func TestCreateNAT(t *testing.T) { t.Run("run instances fails", func(t *testing.T) { mock := &mockEC2{} - setupLTAndAMI(mock) - mock.DescribeImagesFn = func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - return &ec2.DescribeImagesOutput{Images: []ec2types.Image{}}, nil - } + setupLT(mock) mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { return nil, fmt.Errorf("InsufficientInstanceCapacity: No capacity") } @@ -438,10 +422,7 @@ func TestCreateNAT(t *testing.T) { t.Run("config version tag included", func(t *testing.T) { mock := &mockEC2{} - setupLTAndAMI(mock) - mock.DescribeImagesFn = func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - return &ec2.DescribeImagesOutput{Images: []ec2types.Image{}}, nil - } + setupLT(mock) mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { if len(params.TagSpecifications) == 0 { t.Error("expected TagSpecifications") diff --git 
a/cmd/lambda/handler.go b/cmd/lambda/handler.go index 6866e1f..4edb04f 100644 --- a/cmd/lambda/handler.go +++ b/cmd/lambda/handler.go @@ -20,8 +20,6 @@ type Handler struct { IgnoreTagKey string IgnoreTagValue string TargetVPC string - AMIOwner string - AMIPattern string ConfigVersion string } diff --git a/cmd/lambda/handler_test.go b/cmd/lambda/handler_test.go index e86e613..8deb4d8 100644 --- a/cmd/lambda/handler_test.go +++ b/cmd/lambda/handler_test.go @@ -119,19 +119,12 @@ func TestReconcileScaleUp(t *testing.T) { } mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { return &ec2.DescribeLaunchTemplatesOutput{ - LaunchTemplates: []ec2types.LaunchTemplate{{LaunchTemplateId: aws.String("lt-123")}}, - }, nil - } - mock.DescribeLaunchTemplateVersionsFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) { - return &ec2.DescribeLaunchTemplateVersionsOutput{ - LaunchTemplateVersions: []ec2types.LaunchTemplateVersion{{ - LaunchTemplateId: aws.String("lt-123"), VersionNumber: aws.Int64(1), + LaunchTemplates: []ec2types.LaunchTemplate{{ + LaunchTemplateId: aws.String("lt-123"), + LatestVersionNumber: aws.Int64(1), }}, }, nil } - mock.DescribeImagesFn = func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - return &ec2.DescribeImagesOutput{Images: []ec2types.Image{}}, nil - } mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { return &ec2.RunInstancesOutput{ Instances: []ec2types.Instance{{InstanceId: aws.String("i-new1")}}, @@ -751,19 +744,12 @@ func TestReconcileNATEvent(t *testing.T) { } mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params 
*ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { return &ec2.DescribeLaunchTemplatesOutput{ - LaunchTemplates: []ec2types.LaunchTemplate{{LaunchTemplateId: aws.String("lt-123")}}, - }, nil - } - mock.DescribeLaunchTemplateVersionsFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) { - return &ec2.DescribeLaunchTemplateVersionsOutput{ - LaunchTemplateVersions: []ec2types.LaunchTemplateVersion{{ - LaunchTemplateId: aws.String("lt-123"), VersionNumber: aws.Int64(1), + LaunchTemplates: []ec2types.LaunchTemplate{{ + LaunchTemplateId: aws.String("lt-123"), + LatestVersionNumber: aws.Int64(1), }}, }, nil } - mock.DescribeImagesFn = func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - return &ec2.DescribeImagesOutput{Images: []ec2types.Image{}}, nil - } mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { return &ec2.RunInstancesOutput{ Instances: []ec2types.Instance{{InstanceId: aws.String("i-new")}}, diff --git a/cmd/lambda/main.go b/cmd/lambda/main.go index bfe950c..643c12e 100644 --- a/cmd/lambda/main.go +++ b/cmd/lambda/main.go @@ -29,8 +29,6 @@ func main() { IgnoreTagKey: envOr("IGNORE_TAG_KEY", "nat-zero:ignore"), IgnoreTagValue: envOr("IGNORE_TAG_VALUE", "true"), TargetVPC: os.Getenv("TARGET_VPC_ID"), - AMIOwner: envOr("AMI_OWNER_ACCOUNT", "568608671756"), - AMIPattern: envOr("AMI_NAME_PATTERN", "fck-nat-al2023-*-arm64-*"), ConfigVersion: os.Getenv("CONFIG_VERSION"), } diff --git a/cmd/lambda/mock_test.go b/cmd/lambda/mock_test.go index 8fadd16..b96d762 100644 --- a/cmd/lambda/mock_test.go +++ b/cmd/lambda/mock_test.go @@ -11,20 +11,17 @@ import ( // mockEC2 implements EC2API with per-method function fields for test control. 
type mockEC2 struct { - DescribeInstancesFn func(ctx context.Context, params *ec2.DescribeInstancesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) - RunInstancesFn func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) - StartInstancesFn func(ctx context.Context, params *ec2.StartInstancesInput, optFns ...func(*ec2.Options)) (*ec2.StartInstancesOutput, error) - StopInstancesFn func(ctx context.Context, params *ec2.StopInstancesInput, optFns ...func(*ec2.Options)) (*ec2.StopInstancesOutput, error) - TerminateInstancesFn func(ctx context.Context, params *ec2.TerminateInstancesInput, optFns ...func(*ec2.Options)) (*ec2.TerminateInstancesOutput, error) - AllocateAddressFn func(ctx context.Context, params *ec2.AllocateAddressInput, optFns ...func(*ec2.Options)) (*ec2.AllocateAddressOutput, error) - AssociateAddressFn func(ctx context.Context, params *ec2.AssociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.AssociateAddressOutput, error) - DisassociateAddressFn func(ctx context.Context, params *ec2.DisassociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateAddressOutput, error) - ReleaseAddressFn func(ctx context.Context, params *ec2.ReleaseAddressInput, optFns ...func(*ec2.Options)) (*ec2.ReleaseAddressOutput, error) - DescribeAddressesFn func(ctx context.Context, params *ec2.DescribeAddressesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeAddressesOutput, error) - DescribeNetworkInterfacesFn func(ctx context.Context, params *ec2.DescribeNetworkInterfacesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) - DescribeImagesFn func(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) - DescribeLaunchTemplatesFn func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) - 
DescribeLaunchTemplateVersionsFn func(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) + DescribeInstancesFn func(ctx context.Context, params *ec2.DescribeInstancesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) + RunInstancesFn func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) + StartInstancesFn func(ctx context.Context, params *ec2.StartInstancesInput, optFns ...func(*ec2.Options)) (*ec2.StartInstancesOutput, error) + StopInstancesFn func(ctx context.Context, params *ec2.StopInstancesInput, optFns ...func(*ec2.Options)) (*ec2.StopInstancesOutput, error) + TerminateInstancesFn func(ctx context.Context, params *ec2.TerminateInstancesInput, optFns ...func(*ec2.Options)) (*ec2.TerminateInstancesOutput, error) + AllocateAddressFn func(ctx context.Context, params *ec2.AllocateAddressInput, optFns ...func(*ec2.Options)) (*ec2.AllocateAddressOutput, error) + AssociateAddressFn func(ctx context.Context, params *ec2.AssociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.AssociateAddressOutput, error) + DisassociateAddressFn func(ctx context.Context, params *ec2.DisassociateAddressInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateAddressOutput, error) + ReleaseAddressFn func(ctx context.Context, params *ec2.ReleaseAddressInput, optFns ...func(*ec2.Options)) (*ec2.ReleaseAddressOutput, error) + DescribeAddressesFn func(ctx context.Context, params *ec2.DescribeAddressesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeAddressesOutput, error) + DescribeLaunchTemplatesFn func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) // Call tracking (mutex-protected for concurrent access) mu sync.Mutex @@ -134,22 +131,6 @@ func (m *mockEC2) DescribeAddresses(ctx context.Context, params 
*ec2.DescribeAdd return &ec2.DescribeAddressesOutput{}, nil } -func (m *mockEC2) DescribeNetworkInterfaces(ctx context.Context, params *ec2.DescribeNetworkInterfacesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) { - m.track("DescribeNetworkInterfaces", params) - if m.DescribeNetworkInterfacesFn != nil { - return m.DescribeNetworkInterfacesFn(ctx, params, optFns...) - } - return &ec2.DescribeNetworkInterfacesOutput{}, nil -} - -func (m *mockEC2) DescribeImages(ctx context.Context, params *ec2.DescribeImagesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) { - m.track("DescribeImages", params) - if m.DescribeImagesFn != nil { - return m.DescribeImagesFn(ctx, params, optFns...) - } - return &ec2.DescribeImagesOutput{}, nil -} - func (m *mockEC2) DescribeLaunchTemplates(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { m.track("DescribeLaunchTemplates", params) if m.DescribeLaunchTemplatesFn != nil { @@ -158,14 +139,6 @@ func (m *mockEC2) DescribeLaunchTemplates(ctx context.Context, params *ec2.Descr return &ec2.DescribeLaunchTemplatesOutput{}, nil } -func (m *mockEC2) DescribeLaunchTemplateVersions(ctx context.Context, params *ec2.DescribeLaunchTemplateVersionsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplateVersionsOutput, error) { - m.track("DescribeLaunchTemplateVersions", params) - if m.DescribeLaunchTemplateVersionsFn != nil { - return m.DescribeLaunchTemplateVersionsFn(ctx, params, optFns...) 
- } - return &ec2.DescribeLaunchTemplateVersionsOutput{}, nil -} - // --- Test helper builders --- const ( @@ -220,8 +193,6 @@ func newTestHandler(mock *mockEC2) *Handler { IgnoreTagKey: "nat-zero:ignore", IgnoreTagValue: "true", TargetVPC: testVPC, - AMIOwner: "568608671756", - AMIPattern: "fck-nat-al2023-*-arm64-*", ConfigVersion: "", } } diff --git a/docs/examples.md b/docs/examples.md index 6ac5448..ab5ae72 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -55,7 +55,7 @@ module "nat_zero" { private_route_table_ids = module.vpc.private_route_table_ids private_subnets_cidr_blocks = module.vpc.private_subnets_cidr_blocks - # Defaults: t4g.nano, fck-nat AMI, on-demand + # Defaults: t4g.nano, promoted public nat-zero AMI track, on-demand # Uncomment for spot instances: # market_type = "spot" @@ -92,7 +92,7 @@ module "nat_zero" { ## Custom AMI -To use a custom AMI instead of the default fck-nat AMI: +To use your own NAT Zero-compatible AMI instead of the default public nat-zero AMI: ```hcl module "nat_zero" { @@ -100,9 +100,8 @@ module "nat_zero" { # ... required variables ... - use_fck_nat_ami = false - custom_ami_owner = "123456789012" - custom_ami_name_pattern = "my-nat-ami-*" + ami_owner_account = "123456789012" + ami_name_pattern = "my-nat-ami-*" } ``` @@ -118,6 +117,8 @@ module "nat_zero" { } ``` +Custom AMIs must preserve nat-zero's deterministic dual-ENI boot model. `fck-nat` AMIs are not compatible because they query IMDS/AWS during bootstrap to infer ENI attachment, while nat-zero relies on the launch template ENIs being known up front and the EIP being attached later by the reconciler. + ## Disable Root Volume Encryption The root EBS volume is encrypted by default. 
To disable encryption (e.g., for environments without compliance requirements): diff --git a/docs/index.md b/docs/index.md index fdd5f02..ab08889 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,7 +4,7 @@ nat-zero is a Terraform module that replaces always-on NAT with on-demand NAT instances. When a workload launches in a private subnet, a NAT instance starts automatically. When the last workload stops, the NAT shuts down and its Elastic IP is released. Idle cost: ~$0.80/month per AZ. -Built on [fck-nat](https://fck-nat.dev/) AMIs. Orchestrated by a single Go Lambda (~55 ms cold start, 29 MB memory). Integration-tested against real AWS infrastructure on every PR. +Built around a NAT Zero AMI baked in-repo and promoted through a dedicated workflow. Orchestrated by a single Go Lambda (~55 ms cold start, 29 MB memory). Integration-tested against real AWS infrastructure on every PR. ## Quick start @@ -37,3 +37,7 @@ module "nat_zero" { - [Examples](examples.md) — spot instances, custom AMIs, building from source - [Terraform Reference](reference.md) — inputs, outputs, resources - [Testing](testing.md) — integration test lifecycle and CI + +`fck-nat` AMIs are intentionally unsupported here. They discover ENIs via IMDS/AWS calls during bootstrap, which does not match nat-zero's launch-template-owned ENIs and delayed EIP attachment model. + +`fck-nat` itself is still a good fit when you want a conventional always-on NAT instance and do not need nat-zero's scale-to-zero lifecycle. diff --git a/docs/performance.md b/docs/performance.md index f7a5816..e85c1f0 100644 --- a/docs/performance.md +++ b/docs/performance.md @@ -20,7 +20,7 @@ All measurements from real integration tests in us-east-1 with `t4g.nano` instan 2.3 s RunInstances returns — NAT is "pending" Lambda returns. 
-~8.0 s NAT reaches "running" (EC2 boot + fck-nat config) +~8.0 s NAT reaches "running" (EC2 boot + NAT config) ~8.3 s EventBridge delivers NAT "running" event ~8.9 s Lambda: allocate EIP + associate (~3 s) diff --git a/docs/reference.md b/docs/reference.md index d286474..3b849c6 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -11,9 +11,9 @@ | Name | Version | |------|---------| -| [aws](#provider\_aws) | >= 5.0 | -| [null](#provider\_null) | >= 3.0 | -| [time](#provider\_time) | >= 0.9 | +| [aws](#provider\_aws) | 6.34.0 | +| [null](#provider\_null) | 3.2.4 | +| [time](#provider\_time) | 0.13.1 | ## Modules @@ -44,17 +44,18 @@ No modules. | [null_resource.download_lambda](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [time_sleep.eventbridge_propagation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [time_sleep.lambda_ready](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | +| [aws_ami.nat](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [ami\_id](#input\_ami\_id) | Explicit AMI ID to use (overrides AMI lookup entirely) | `string` | `null` | no | +| [ami\_name\_pattern](#input\_ami\_name\_pattern) | AMI name pattern used when resolving the default nat-zero AMI. Override this to use your own shared AMI. | `string` | `null` | no | +| [ami\_owner\_account](#input\_ami\_owner\_account) | Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI. 
| `string` | `null` | no | | [availability\_zones](#input\_availability\_zones) | List of availability zones to deploy NAT instances in | `list(string)` | n/a | yes | | [block\_device\_size](#input\_block\_device\_size) | Size in GB of the root EBS volume | `number` | `10` | no | | [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source instead of downloading a pre-compiled release. Requires Go and zip installed locally. | `bool` | `false` | no | -| [custom\_ami\_name\_pattern](#input\_custom\_ami\_name\_pattern) | AMI name pattern when use\_fck\_nat\_ami is false | `string` | `null` | no | -| [custom\_ami\_owner](#input\_custom\_ami\_owner) | AMI owner account ID when use\_fck\_nat\_ami is false | `string` | `null` | no | | [enable\_logging](#input\_enable\_logging) | Create a CloudWatch log group for the Lambda function | `bool` | `true` | no | | [encrypt\_root\_volume](#input\_encrypt\_root\_volume) | Encrypt the root EBS volume. | `bool` | `true` | no | | [ignore\_tag\_key](#input\_ignore\_tag\_key) | Tag key used to mark instances the Lambda should ignore | `string` | `"nat-zero:ignore"` | no | @@ -72,7 +73,6 @@ No modules. | [private\_subnets\_cidr\_blocks](#input\_private\_subnets\_cidr\_blocks) | CIDR blocks for the private subnets (one per AZ, used in security group rules) | `list(string)` | n/a | yes | | [public\_subnets](#input\_public\_subnets) | Public subnet IDs (one per AZ) for NAT instance public ENIs | `list(string)` | n/a | yes | | [tags](#input\_tags) | Additional tags to apply to all resources | `map(string)` | `{}` | no | -| [use\_fck\_nat\_ami](#input\_use\_fck\_nat\_ami) | Use the public fck-nat AMI. Set to false to use a custom AMI. 
| `bool` | `true` | no | | [vpc\_id](#input\_vpc\_id) | The VPC ID where NAT instances will be deployed | `string` | n/a | yes | ## Outputs diff --git a/docs/testing.md b/docs/testing.md index 0dce6a3..baf319d 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -64,4 +64,4 @@ Integration tests run in GitHub Actions when the `integration-test` label is add ## Config Version Replacement -The Lambda tags NAT instances with a `ConfigVersion` hash (AMI + instance type + market type + volume size + encryption). When the config changes and a workload triggers reconciliation, the Lambda terminates the outdated NAT and creates a replacement. The integration test doesn't exercise this path directly, but it's covered by unit tests. +The Lambda tags NAT instances with a `ConfigVersion` hash (resolved AMI ID + instance type + market type + volume size + encryption). When the config changes and a workload triggers reconciliation, the Lambda terminates the outdated NAT and creates a replacement. The integration suite can now exercise this path by setting `NAT_ZERO_TEST_UPDATED_NAT_AMI_ID` before running `go test`. 
diff --git a/docs/workflows.md b/docs/workflows.md index 333c289..70e0333 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -6,18 +6,18 @@ Internal reference for GitHub Actions workflows, repo rulesets, and the release | Workflow | File | Triggers | Required Check | |----------|------|----------|----------------| -| Pre-commit | `precommit.yml` | All PRs; push to `main` (filtered paths) | `precommit` | +| Pre-commit | `precommit.yml` | All PRs | `precommit` | | Go Tests | `go-tests.yml` | PRs touching `cmd/lambda/**`; push to `main` | `go-test` | -| Integration Tests | `integration-tests.yml` | PR labeled `integration-test`; manual dispatch | `integration-test` | +| Integration Tests | `integration-tests.yml` | PR labeled `integration-test`; manual dispatch; reusable workflow | `integration-test` | +| NAT Images | `nat-images.yml` | Manual dispatch | No (promotion workflow) | | Docs | `docs.yml` | Push to `main` (filtered paths) | No (post-merge deploy) | | Release | `release-please.yml` | Push to `main`; manual dispatch | No (post-merge) | ## Pre-commit (`precommit.yml`) -Runs the repo's `.pre-commit-config.yaml` hooks: terraform fmt, tflint, terraform-docs, Go staticcheck, etc. +Runs the repo's `.pre-commit-config.yaml` hooks: terraform fmt/validate, tflint, terraform-docs, Go staticcheck, actionlint, shellcheck, and Packer fmt/validate. - **PR trigger**: All pull requests, all paths (no path filter). -- **Push trigger**: Only on `main`, only when `*.tf`, `cmd/lambda/**`, `.pre-commit-config.yaml`, or `.terraform-docs.yml` change. - **Job name**: `precommit` (required status check for merge). ## Go Tests (`go-tests.yml`) @@ -35,11 +35,15 @@ Full end-to-end test: deploys real AWS infrastructure via Terratest, exercises t - **PR trigger**: `labeled` type only. Runs when the `integration-test` label is added. - **Manual trigger**: `workflow_dispatch`. +- **Reusable trigger**: `workflow_call`. 
- **Condition**: `github.event.label.name == 'integration-test'` (or manual dispatch). - **Concurrency**: Group `nat-zero-integration`, `cancel-in-progress: false`. Only one integration test runs at a time; new ones queue. - **Environment**: `integration` (holds the `INTEGRATION_ROLE_ARN` secret for OIDC). - **Timeout**: 15 minutes. - **Job name**: `integration-test` (required status check for merge). +- **Optional inputs**: + - `nat_ami_id` to force the fixture onto a specific NAT AMI. + - `updated_nat_ami_id` to exercise the AMI replacement path after a second `terraform apply`. ### Steps @@ -48,6 +52,18 @@ Full end-to-end test: deploys real AWS infrastructure via Terratest, exercises t 3. Build the Lambda binary from source (`cmd/lambda/` -> `.build/lambda.zip`). 4. Run `go test -v -timeout 10m -count=1` in `tests/integration/`. +## NAT Images (`nat-images.yml`) + +Manual promotion workflow for the default public nat-zero AMI. + +1. Build the AMI with Packer in the chosen source region. +2. Copy it to every enabled AWS region in the account. +3. Run two us-east-1 integration gates: + - direct test of the new AMI + - upgrade-path test that reapplies the module with the new AMI and verifies the old NAT is replaced +4. Make every copied AMI public. +5. Open a PR that updates the Terraform defaults (`ami_owner_account`, `ami_name_pattern`) so merge + release-please can publish the new module version. + ## Docs (`docs.yml`) Deploys MkDocs Material to GitHub Pages. 
diff --git a/examples/basic/main.tf b/examples/basic/main.tf index 7910af8..1567949 100644 --- a/examples/basic/main.tf +++ b/examples/basic/main.tf @@ -48,7 +48,7 @@ module "nat_zero" { private_route_table_ids = module.vpc.private_route_table_ids private_subnets_cidr_blocks = module.vpc.private_subnets_cidr_blocks - # Defaults: t4g.nano, fck-nat AMI, on-demand + # Defaults: t4g.nano, nat-zero AMI, on-demand # Uncomment for spot instances: # market_type = "spot" diff --git a/iam.tf b/iam.tf index 0365f10..848362f 100644 --- a/iam.tf +++ b/iam.tf @@ -47,10 +47,7 @@ resource "aws_iam_role_policy" "lambda_iam_policy" { Effect = "Allow" Action = [ "ec2:DescribeInstances", - "ec2:DescribeImages", "ec2:DescribeLaunchTemplates", - "ec2:DescribeLaunchTemplateVersions", - "ec2:DescribeNetworkInterfaces", "ec2:DescribeAddresses", ] Resource = "*" diff --git a/lambda.tf b/lambda.tf index 049ec2e..29f5733 100644 --- a/lambda.tf +++ b/lambda.tf @@ -66,17 +66,13 @@ resource "aws_lambda_function" "nat_zero" { environment { variables = { - NAT_TAG_KEY = var.nat_tag_key - NAT_TAG_VALUE = var.nat_tag_value - IGNORE_TAG_KEY = var.ignore_tag_key - IGNORE_TAG_VALUE = var.ignore_tag_value - TARGET_VPC_ID = var.vpc_id - AMI_OWNER_ACCOUNT = var.use_fck_nat_ami ? "568608671756" : var.custom_ami_owner - AMI_NAME_PATTERN = var.use_fck_nat_ami ? "fck-nat-al2023-*-arm64-*" : var.custom_ami_name_pattern + NAT_TAG_KEY = var.nat_tag_key + NAT_TAG_VALUE = var.nat_tag_value + IGNORE_TAG_KEY = var.ignore_tag_key + IGNORE_TAG_VALUE = var.ignore_tag_value + TARGET_VPC_ID = var.vpc_id CONFIG_VERSION = sha256(join(",", [ - var.use_fck_nat_ami ? "568608671756" : var.custom_ami_owner, - var.use_fck_nat_ami ? 
"fck-nat-al2023-*-arm64-*" : var.custom_ami_name_pattern, - coalesce(var.ami_id, "none"), + coalesce(local.effective_ami_id, "missing"), var.instance_type, var.market_type, tostring(var.block_device_size), diff --git a/launch_template.tf b/launch_template.tf index 2388791..05ff688 100644 --- a/launch_template.tf +++ b/launch_template.tf @@ -11,7 +11,7 @@ resource "aws_launch_template" "nat_launch_template" { count = length(var.availability_zones) name = "${var.name}-${var.availability_zones[count.index]}-launch-template" instance_type = var.instance_type - image_id = var.ami_id + image_id = local.effective_ami_id iam_instance_profile { arn = aws_iam_instance_profile.nat_instance_profile.arn @@ -76,4 +76,11 @@ resource "aws_launch_template" "nat_launch_template" { }, local.common_tags, ) + + lifecycle { + precondition { + condition = local.effective_ami_id != null + error_message = "Set ami_id or configure a resolvable AMI source with ami_owner_account and ami_name_pattern." + } + } } diff --git a/scripts/update_ami_defaults.sh b/scripts/update_ami_defaults.sh new file mode 100644 index 0000000..92f20a5 --- /dev/null +++ b/scripts/update_ami_defaults.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ "$#" -ne 2 ]; then + echo "usage: $0 " >&2 + exit 1 +fi + +owner_account_id="$1" +ami_name_pattern="$2" + +update_variable_default() { + local file="$1" + local variable_name="$2" + local replacement="$3" + local tmp_file + + tmp_file="$(mktemp)" + if ! 
awk -v variable_name="$variable_name" -v replacement="$replacement" ' + BEGIN { + in_variable = 0 + updated = 0 + } + $0 ~ "^variable \"" variable_name "\" \\{" { + in_variable = 1 + } + in_variable && $1 == "default" { + sub(/=.*/, "= " replacement) + in_variable = 0 + updated = 1 + } + { + print + } + END { + if (updated == 0) { + exit 1 + } + } + ' "$file" > "$tmp_file"; then + rm -f "$tmp_file" + echo "failed to update default for ${variable_name}" >&2 + exit 1 + fi + + mv "$tmp_file" "$file" +} + +update_variable_default "variables.tf" "ami_owner_account" "\"${owner_account_id}\"" +update_variable_default "variables.tf" "ami_name_pattern" "\"${ami_name_pattern}\"" diff --git a/tests/integration/fixture/main.tf b/tests/integration/fixture/main.tf index 0a608b4..9e182b2 100644 --- a/tests/integration/fixture/main.tf +++ b/tests/integration/fixture/main.tf @@ -71,6 +71,11 @@ variable "encrypt_root_volume" { default = true } +variable "nat_ami_id" { + type = string + default = null +} + module "nat_zero" { source = "../../../" @@ -86,6 +91,7 @@ module "nat_zero" { instance_type = var.nat_instance_type market_type = "on-demand" encrypt_root_volume = var.encrypt_root_volume + ami_id = var.nat_ami_id } output "vpc_id" { diff --git a/tests/integration/nat_zero_test.go b/tests/integration/nat_zero_test.go index dc4af3a..68c2af4 100644 --- a/tests/integration/nat_zero_test.go +++ b/tests/integration/nat_zero_test.go @@ -4,6 +4,7 @@ import ( "encoding/base64" "encoding/json" "fmt" + "os" "strings" "testing" "time" @@ -112,9 +113,21 @@ func TestNatZero(t *testing.T) { t.Logf("Deleted SQS queue %s", queueName) }() + initialNatAMI := strings.TrimSpace(os.Getenv("NAT_ZERO_TEST_NAT_AMI_ID")) + updatedNatAMI := strings.TrimSpace(os.Getenv("NAT_ZERO_TEST_UPDATED_NAT_AMI_ID")) + tfVars := map[string]interface{}{} + if initialNatAMI != "" { + tfVars["nat_ami_id"] = initialNatAMI + t.Logf("Initial NAT AMI override: %s", initialNatAMI) + } + if updatedNatAMI != "" { + 
t.Logf("Updated NAT AMI target: %s", updatedNatAMI) + } + opts := terraform.WithDefaultRetryableErrors(t, &terraform.Options{ TerraformDir: "./fixture", NoColor: true, + Vars: tfVars, }) defer func() { destroyStart := time.Now() @@ -170,50 +183,27 @@ func TestNatZero(t *testing.T) { amiID := getLatestAL2023AMI(t, ec2Client) // Shared across phases — set by Phase 1, used by Phase 2. - var workloadID string + var activeWorkloadID string // ── Phase 1: NAT creation and connectivity ────────────────────────── // Launch a workload and let EventBridge trigger the Lambda automatically. t.Run("NATCreationAndConnectivity", func(t *testing.T) { wlStart := time.Now() - workloadID = launchWorkload(t, ec2Client, privateSubnet, amiID, runID, profileName, queueURL) + activeWorkloadID = launchWorkload(t, ec2Client, privateSubnet, amiID, runID, profileName, queueURL) record("Launch workload instance", time.Since(wlStart)) - t.Logf("Launched workload %s in VPC %s", workloadID, vpcID) + t.Logf("Launched workload %s in VPC %s", activeWorkloadID, vpcID) // EventBridge fires when the workload goes pending/running, // triggering the Lambda to create a NAT and attach an EIP. 
t.Log("Waiting for NAT to be running with EIP (via EventBridge)...") start := time.Now() - var natInstance *ec2.Instance - retry.DoWithRetry(t, "NAT running with EIP", 100, 2*time.Second, func() (string, error) { - nats := findNATInstances(t, ec2Client, vpcID) - for _, n := range nats { - if aws.StringValue(n.State.Name) == "running" { - for _, eni := range n.NetworkInterfaces { - if aws.Int64Value(eni.Attachment.DeviceIndex) == 0 && - eni.Association != nil && eni.Association.PublicIp != nil { - natInstance = n - return "OK", nil - } - } - return "", fmt.Errorf("NAT running but no EIP yet") - } - } - return "", fmt.Errorf("no running NAT (%d found)", len(nats)) - }) + natInstance := waitForRunningNATWithEIP(t, ec2Client, vpcID, "NAT running with EIP") natUpTime := time.Since(start) record("Wait for NAT running with EIP", natUpTime) t.Logf("NAT up with EIP in %s", natUpTime.Round(time.Millisecond)) - // Get NAT public IP from primary ENI. - var natEIP string - for _, eni := range natInstance.NetworkInterfaces { - if aws.Int64Value(eni.Attachment.DeviceIndex) == 0 && eni.Association != nil { - natEIP = aws.StringValue(eni.Association.PublicIp) - break - } - } + natEIP := natPublicIP(natInstance) require.NotEmpty(t, natEIP, "NAT should have a public IP") // Validate NAT tags. @@ -253,17 +243,18 @@ func TestNatZero(t *testing.T) { // scale-down flow: stop NAT, then detach/release EIP. t.Run("NATScaleDown", func(t *testing.T) { - require.NotEmpty(t, workloadID, "Phase 1 must set workloadID") + require.NotEmpty(t, activeWorkloadID, "Phase 1 must set activeWorkloadID") // Terminate the workload instance. EventBridge fires shutting-down // and terminated events which trigger the Lambda to stop the NAT. 
t.Log("Terminating workload to trigger NAT scale-down...") termStart := time.Now() _, err := ec2Client.TerminateInstances(&ec2.TerminateInstancesInput{ - InstanceIds: []*string{aws.String(workloadID)}, + InstanceIds: []*string{aws.String(activeWorkloadID)}, }) require.NoError(t, err) record("Terminate workload instance", time.Since(termStart)) + activeWorkloadID = "" // Wait for NAT to reach stopped state. t.Log("Waiting for NAT to stop (via EventBridge)...") @@ -319,28 +310,13 @@ func TestNatZero(t *testing.T) { newWorkloadID := launchWorkload(t, ec2Client, privateSubnet, amiID, runID, profileName, queueURL) record("Launch workload instance (restart)", time.Since(wlStart)) t.Logf("Launched workload %s", newWorkloadID) + activeWorkloadID = newWorkloadID // EventBridge fires when the new workload goes pending/running, // triggering the Lambda to start the stopped NAT. t.Log("Waiting for restarted NAT to be running with EIP (via EventBridge)...") start := time.Now() - var natInstance *ec2.Instance - retry.DoWithRetry(t, "NAT restarted with EIP", 100, 2*time.Second, func() (string, error) { - nats := findNATInstances(t, ec2Client, vpcID) - for _, n := range nats { - if aws.StringValue(n.State.Name) == "running" { - for _, eni := range n.NetworkInterfaces { - if aws.Int64Value(eni.Attachment.DeviceIndex) == 0 && - eni.Association != nil && eni.Association.PublicIp != nil { - natInstance = n - return "OK", nil - } - } - return "", fmt.Errorf("NAT running but no EIP yet") - } - } - return "", fmt.Errorf("no running NAT (%d found)", len(nats)) - }) + natInstance := waitForRunningNATWithEIP(t, ec2Client, vpcID, "NAT restarted with EIP") natRestartTime := time.Since(start) record("Wait for NAT restarted with EIP", natRestartTime) t.Logf("NAT restarted with EIP in %s", natRestartTime.Round(time.Millisecond)) @@ -348,13 +324,7 @@ func TestNatZero(t *testing.T) { require.NotNil(t, natInstance, "NAT should be running") // Verify the restarted NAT has an EIP. 
- var natEIP string - for _, eni := range natInstance.NetworkInterfaces { - if aws.Int64Value(eni.Attachment.DeviceIndex) == 0 && eni.Association != nil { - natEIP = aws.StringValue(eni.Association.PublicIp) - break - } - } + natEIP := natPublicIP(natInstance) require.NotEmpty(t, natEIP, "Restarted NAT should have a public IP") t.Logf("Restarted NAT has EIP %s", natEIP) @@ -374,7 +344,70 @@ func TestNatZero(t *testing.T) { } }) - // ── Phase 4: Cleanup action ───────────────────────────────────────── + // ── Phase 4: NAT replacement on AMI update ───────────────────────── + + t.Run("NATAMIUpgrade", func(t *testing.T) { + if updatedNatAMI == "" { + t.Skip("NAT_ZERO_TEST_UPDATED_NAT_AMI_ID not set") + } + require.NotEmpty(t, activeWorkloadID, "AMI update phase requires an active workload") + + currentNat := waitForRunningNATWithEIP(t, ec2Client, vpcID, "current NAT running with EIP") + oldNatID := aws.StringValue(currentNat.InstanceId) + oldNatAMI := aws.StringValue(currentNat.ImageId) + require.NotEmpty(t, oldNatID, "current NAT should have an instance id") + require.NotEmpty(t, oldNatAMI, "current NAT should have an AMI id") + if oldNatAMI == updatedNatAMI { + t.Skipf("current NAT already uses target AMI %s", updatedNatAMI) + } + t.Logf("Updating NAT AMI from %s to %s", oldNatAMI, updatedNatAMI) + + applyStart := time.Now() + opts.Vars["nat_ami_id"] = updatedNatAMI + terraform.Apply(t, opts) + record("Terraform apply (AMI update)", time.Since(applyStart)) + + invokeTerminateStart := time.Now() + invokeLambda(t, lambdaClient, lambdaName, map[string]string{ + "instance_id": activeWorkloadID, + "state": "running", + }) + record("Lambda invoke (AMI update terminate)", time.Since(invokeTerminateStart)) + + waitTermStart := time.Now() + waitForInstanceTerminated(t, ec2Client, oldNatID) + record("Wait for outdated NAT terminated", time.Since(waitTermStart)) + + invokeCreateStart := time.Now() + invokeLambda(t, lambdaClient, lambdaName, map[string]string{ + "instance_id": 
activeWorkloadID, + "state": "running", + }) + record("Lambda invoke (AMI update create)", time.Since(invokeCreateStart)) + + replacementStart := time.Now() + replacementNat := waitForRunningNATWithEIP(t, ec2Client, vpcID, "replacement NAT running with EIP") + record("Wait for replacement NAT running with EIP", time.Since(replacementStart)) + + require.NotEqual(t, oldNatID, aws.StringValue(replacementNat.InstanceId), "replacement NAT should be a new instance") + require.Equal(t, updatedNatAMI, aws.StringValue(replacementNat.ImageId), "replacement NAT should use updated AMI") + + replacementEIP := natPublicIP(replacementNat) + require.NotEmpty(t, replacementEIP, "replacement NAT should have a public IP") + + upgradeWorkloadStart := time.Now() + upgradeWorkloadID := launchWorkload(t, ec2Client, privateSubnet, amiID, runID, profileName, queueURL) + record("Launch workload instance (AMI update)", time.Since(upgradeWorkloadStart)) + activeWorkloadID = upgradeWorkloadID + + t.Log("Waiting for workload connectivity via replacement NAT (SQS)...") + egressStart := time.Now() + msg := waitForEgress(t, sqsClient, queueURL, 4*time.Minute) + record("Wait for workload egress IP (AMI update)", time.Since(egressStart)) + require.Equal(t, replacementEIP, msg.EgressIP, "workload egress IP should match replacement NAT EIP") + }) + + // ── Phase 5: Cleanup action ───────────────────────────────────────── t.Run("CleanupAction", func(t *testing.T) { // Terminate all test workloads before cleanup to match production @@ -658,6 +691,56 @@ func findWorkloadsInState(t *testing.T, c *ec2.EC2, vpcID, runID string, states return res } +func waitForRunningNATWithEIP(t *testing.T, c *ec2.EC2, vpcID, description string) *ec2.Instance { + t.Helper() + + var natInstance *ec2.Instance + retry.DoWithRetry(t, description, 100, 2*time.Second, func() (string, error) { + nats := findNATInstances(t, c, vpcID) + for _, n := range nats { + if aws.StringValue(n.State.Name) == "running" && natPublicIP(n) != "" 
{ + natInstance = n + return "OK", nil + } + if aws.StringValue(n.State.Name) == "running" { + return "", fmt.Errorf("NAT running but no EIP yet") + } + } + return "", fmt.Errorf("no running NAT (%d found)", len(nats)) + }) + return natInstance +} + +func waitForInstanceTerminated(t *testing.T, c *ec2.EC2, instanceID string) { + t.Helper() + + retry.DoWithRetry(t, "instance terminated", 60, 2*time.Second, func() (string, error) { + out, err := c.DescribeInstances(&ec2.DescribeInstancesInput{ + InstanceIds: []*string{aws.String(instanceID)}, + }) + if err != nil { + return "", err + } + if len(out.Reservations) == 0 || len(out.Reservations[0].Instances) == 0 { + return "OK", nil + } + state := aws.StringValue(out.Reservations[0].Instances[0].State.Name) + if state == ec2.InstanceStateNameTerminated { + return "OK", nil + } + return "", fmt.Errorf("instance %s still %s", instanceID, state) + }) +} + +func natPublicIP(nat *ec2.Instance) string { + for _, eni := range nat.NetworkInterfaces { + if aws.Int64Value(eni.Attachment.DeviceIndex) == 0 && eni.Association != nil { + return aws.StringValue(eni.Association.PublicIp) + } + } + return "" +} + func launchWorkload(t *testing.T, c *ec2.EC2, subnet, ami, runID, profile, queueURL string) string { t.Helper() out, err := c.RunInstances(&ec2.RunInstancesInput{ diff --git a/variables.tf b/variables.tf index 47c046a..49e89ef 100644 --- a/variables.tf +++ b/variables.tf @@ -68,29 +68,22 @@ variable "encrypt_root_volume" { description = "Encrypt the root EBS volume." } -# AMI configuration -variable "use_fck_nat_ami" { - type = bool - default = true - description = "Use the public fck-nat AMI. Set to false to use a custom AMI." 
-} - variable "ami_id" { type = string default = null description = "Explicit AMI ID to use (overrides AMI lookup entirely)" } -variable "custom_ami_owner" { +variable "ami_owner_account" { type = string default = null - description = "AMI owner account ID when use_fck_nat_ami is false" + description = "Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI." } -variable "custom_ami_name_pattern" { +variable "ami_name_pattern" { type = string default = null - description = "AMI name pattern when use_fck_nat_ami is false" + description = "AMI name pattern used when resolving the default nat-zero AMI. Override this to use your own shared AMI." } variable "nat_tag_key" { From 1298ea1a2d94a6b9f7d18beef0317d4300eb5ee7 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Fri, 6 Mar 2026 13:03:32 +1000 Subject: [PATCH 03/27] fix: align AMI workflows with CI and integration defaults --- .github/workflows/integration-tests.yml | 33 ++++++++++++++++++-- .github/workflows/nat-images.yml | 41 ++++++++++++++++++++++--- .github/workflows/precommit.yml | 2 +- docs/testing.md | 1 + docs/workflows.md | 4 +-- 5 files changed, 71 insertions(+), 10 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index bd28558..53f64ba 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -30,6 +30,10 @@ permissions: id-token: write contents: read +env: + TEST_NAT_AMI_OWNER_ACCOUNT: "590144423513" + TEST_NAT_AMI_NAME_PATTERN: "nat-zero-al2023-minimal-arm64-*" + jobs: integration-test: if: >- @@ -39,9 +43,6 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 15 environment: integration - env: - NAT_ZERO_TEST_NAT_AMI_ID: ${{ inputs.nat_ami_id || github.event.inputs.nat_ami_id || '' }} - NAT_ZERO_TEST_UPDATED_NAT_AMI_ID: ${{ inputs.updated_nat_ami_id || github.event.inputs.updated_nat_ami_id || '' }} steps: - uses: 
actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 @@ -58,6 +59,32 @@ jobs: role-to-assume: ${{ secrets.INTEGRATION_ROLE_ARN }} aws-region: us-east-1 + - name: Resolve NAT AMI inputs + env: + INPUT_NAT_AMI_ID: ${{ inputs.nat_ami_id || github.event.inputs.nat_ami_id || '' }} + INPUT_UPDATED_NAT_AMI_ID: ${{ inputs.updated_nat_ami_id || github.event.inputs.updated_nat_ami_id || '' }} + run: | + nat_ami_id="$INPUT_NAT_AMI_ID" + if [ -z "$nat_ami_id" ]; then + nat_ami_id="$(aws ec2 describe-images \ + --region us-east-1 \ + --owners "$TEST_NAT_AMI_OWNER_ACCOUNT" \ + --filters "Name=name,Values=$TEST_NAT_AMI_NAME_PATTERN" "Name=state,Values=available" \ + --query 'reverse(sort_by(Images,&CreationDate))[0].ImageId' \ + --output text)" + fi + + if [ -z "$nat_ami_id" ] || [ "$nat_ami_id" = "None" ]; then + echo "failed to resolve the default integration NAT AMI" >&2 + exit 1 + fi + + echo "NAT_ZERO_TEST_NAT_AMI_ID=$nat_ami_id" >> "$GITHUB_ENV" + + if [ -n "$INPUT_UPDATED_NAT_AMI_ID" ]; then + echo "NAT_ZERO_TEST_UPDATED_NAT_AMI_ID=$INPUT_UPDATED_NAT_AMI_ID" >> "$GITHUB_ENV" + fi + - name: Build Lambda binary working-directory: cmd/lambda run: | diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml index 582f843..672016c 100644 --- a/.github/workflows/nat-images.yml +++ b/.github/workflows/nat-images.yml @@ -27,6 +27,10 @@ permissions: contents: write pull-requests: write +env: + TEST_NAT_AMI_OWNER_ACCOUNT: "590144423513" + TEST_NAT_AMI_NAME_PATTERN: "nat-zero-al2023-minimal-arm64-*" + jobs: build-and-copy: runs-on: ubuntu-latest @@ -40,7 +44,7 @@ jobs: steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: hashicorp/setup-packer@v3 + - uses: hashicorp/setup-packer@1aa358be5cf73883762b302a3a03abd66e75b232 # v3 - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 with: @@ -120,6 +124,8 @@ jobs: aws ec2 wait image-available --region "$region" --image-ids "$image_id" } 
+ pids=() + while IFS= read -r region; do if [ "$region" = "$SOURCE_REGION" ]; then continue @@ -129,9 +135,17 @@ jobs: set -euo pipefail copy_region "$region" ) & + pids+=("$!") done < <(jq -r '.[]' <<<"$REGIONS_JSON") - wait + status=0 + for pid in "${pids[@]}"; do + if ! wait "$pid"; then + status=1 + fi + done + + exit "$status" - name: Resolve us-east-1 test AMI id: test-ami @@ -202,8 +216,6 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 15 environment: integration - env: - NAT_ZERO_TEST_UPDATED_NAT_AMI_ID: ${{ needs.build-and-copy.outputs.test_ami_id }} steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 @@ -220,6 +232,23 @@ jobs: role-to-assume: ${{ secrets.INTEGRATION_ROLE_ARN }} aws-region: us-east-1 + - name: Resolve baseline NAT AMI + run: | + nat_ami_id="$(aws ec2 describe-images \ + --region us-east-1 \ + --owners "$TEST_NAT_AMI_OWNER_ACCOUNT" \ + --filters "Name=name,Values=$TEST_NAT_AMI_NAME_PATTERN" "Name=state,Values=available" \ + --query 'reverse(sort_by(Images,&CreationDate))[0].ImageId' \ + --output text)" + + if [ -z "$nat_ami_id" ] || [ "$nat_ami_id" = "None" ]; then + echo "failed to resolve the baseline integration NAT AMI" >&2 + exit 1 + fi + + echo "NAT_ZERO_TEST_NAT_AMI_ID=$nat_ami_id" >> "$GITHUB_ENV" + echo "NAT_ZERO_TEST_UPDATED_NAT_AMI_ID=${{ needs.build-and-copy.outputs.test_ami_id }}" >> "$GITHUB_ENV" + - name: Build Lambda binary working-directory: cmd/lambda run: | @@ -296,6 +325,10 @@ jobs: with: python-version: "3.12" + - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 + with: + go-version-file: cmd/lambda/go.mod + - name: Install pre-commit run: python -m pip install --upgrade pre-commit diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index 86df725..3a7df22 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -16,7 +16,7 @@ jobs: with: go-version-file: cmd/lambda/go.mod - - uses: hashicorp/setup-packer@v3 + - uses: 
hashicorp/setup-packer@1aa358be5cf73883762b302a3a03abd66e75b232 # v3 - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 diff --git a/docs/testing.md b/docs/testing.md index baf319d..cacfdf3 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -57,6 +57,7 @@ Integration tests run in GitHub Actions when the `integration-test` label is add - Concurrency: one test at a time (`cancel-in-progress: false`) - Timeout: 15 minutes - Region: us-east-1 +- Default NAT AMI: resolves the shared test nat-zero AMI from account `590144423513` in us-east-1 unless `nat_ami_id` is supplied explicitly ## Orphan Detection diff --git a/docs/workflows.md b/docs/workflows.md index 70e0333..f4b1c28 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -42,7 +42,7 @@ Full end-to-end test: deploys real AWS infrastructure via Terratest, exercises t - **Timeout**: 15 minutes. - **Job name**: `integration-test` (required status check for merge). - **Optional inputs**: - - `nat_ami_id` to force the fixture onto a specific NAT AMI. + - `nat_ami_id` to force the fixture onto a specific NAT AMI. If omitted, the workflow resolves the shared us-east-1 test AMI from account `590144423513`. - `updated_nat_ami_id` to exercise the AMI replacement path after a second `terraform apply`. ### Steps @@ -60,7 +60,7 @@ Manual promotion workflow for the default public nat-zero AMI. 2. Copy it to every enabled AWS region in the account. 3. Run two us-east-1 integration gates: - direct test of the new AMI - - upgrade-path test that reapplies the module with the new AMI and verifies the old NAT is replaced + - upgrade-path test that starts from the shared us-east-1 test NAT AMI, reapplies the module with the new AMI, and verifies the old NAT is replaced 4. Make every copied AMI public. 5. Open a PR that updates the Terraform defaults (`ami_owner_account`, `ami_name_pattern`) so merge + release-please can publish the new module version. 
From 379c1ea46f84e4228119e5afc7b740754224863a Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Fri, 6 Mar 2026 13:05:32 +1000 Subject: [PATCH 04/27] docs: hide generated provider version tables --- .terraform-docs-reference.yml | 4 ++++ .terraform-docs.yml | 4 ++++ README.md | 8 -------- docs/reference.md | 8 -------- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/.terraform-docs-reference.yml b/.terraform-docs-reference.yml index f1ba32b..00fe526 100644 --- a/.terraform-docs-reference.yml +++ b/.terraform-docs-reference.yml @@ -1,5 +1,9 @@ formatter: "markdown table" +sections: + hide: + - providers + output: template: | {{ .Content }} diff --git a/.terraform-docs.yml b/.terraform-docs.yml index 8e30c37..78d3caf 100644 --- a/.terraform-docs.yml +++ b/.terraform-docs.yml @@ -1 +1,5 @@ formatter: "markdown table" + +sections: + hide: + - providers diff --git a/README.md b/README.md index 07ea58d..7c679c3 100644 --- a/README.md +++ b/README.md @@ -115,14 +115,6 @@ See [Performance](docs/performance.md) for detailed timings and cost breakdowns. | [null](#requirement\_null) | >= 3.0 | | [time](#requirement\_time) | >= 0.9 | -## Providers - -| Name | Version | -|------|---------| -| [aws](#provider\_aws) | 6.34.0 | -| [null](#provider\_null) | 3.2.4 | -| [time](#provider\_time) | 0.13.1 | - ## Modules No modules. diff --git a/docs/reference.md b/docs/reference.md index 3b849c6..6766016 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -7,14 +7,6 @@ | [null](#requirement\_null) | >= 3.0 | | [time](#requirement\_time) | >= 0.9 | -## Providers - -| Name | Version | -|------|---------| -| [aws](#provider\_aws) | 6.34.0 | -| [null](#provider\_null) | 3.2.4 | -| [time](#provider\_time) | 0.13.1 | - ## Modules No modules. 
From bccc048f44617997eec727636e795dfe677b1582 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Fri, 6 Mar 2026 13:21:57 +1000 Subject: [PATCH 05/27] fix: use the shared private test nat ami in CI --- .github/workflows/integration-tests.yml | 17 ++++------------- .github/workflows/nat-images.yml | 16 ++++------------ docs/testing.md | 2 +- docs/workflows.md | 4 ++-- 4 files changed, 11 insertions(+), 28 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 53f64ba..0e12191 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -31,8 +31,7 @@ permissions: contents: read env: - TEST_NAT_AMI_OWNER_ACCOUNT: "590144423513" - TEST_NAT_AMI_NAME_PATTERN: "nat-zero-al2023-minimal-arm64-*" + TEST_NAT_AMI_ID: ${{ vars.NAT_ZERO_TEST_AMI_ID }} jobs: integration-test: @@ -64,18 +63,10 @@ jobs: INPUT_NAT_AMI_ID: ${{ inputs.nat_ami_id || github.event.inputs.nat_ami_id || '' }} INPUT_UPDATED_NAT_AMI_ID: ${{ inputs.updated_nat_ami_id || github.event.inputs.updated_nat_ami_id || '' }} run: | - nat_ami_id="$INPUT_NAT_AMI_ID" - if [ -z "$nat_ami_id" ]; then - nat_ami_id="$(aws ec2 describe-images \ - --region us-east-1 \ - --owners "$TEST_NAT_AMI_OWNER_ACCOUNT" \ - --filters "Name=name,Values=$TEST_NAT_AMI_NAME_PATTERN" "Name=state,Values=available" \ - --query 'reverse(sort_by(Images,&CreationDate))[0].ImageId' \ - --output text)" - fi + nat_ami_id="${INPUT_NAT_AMI_ID:-$TEST_NAT_AMI_ID}" - if [ -z "$nat_ami_id" ] || [ "$nat_ami_id" = "None" ]; then - echo "failed to resolve the default integration NAT AMI" >&2 + if [ -z "$nat_ami_id" ]; then + echo "default integration NAT AMI is not configured" >&2 exit 1 fi diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml index 672016c..3b5a64b 100644 --- a/.github/workflows/nat-images.yml +++ b/.github/workflows/nat-images.yml @@ -28,8 +28,7 @@ permissions: pull-requests: write env: - 
TEST_NAT_AMI_OWNER_ACCOUNT: "590144423513" - TEST_NAT_AMI_NAME_PATTERN: "nat-zero-al2023-minimal-arm64-*" + TEST_NAT_AMI_ID: ${{ vars.NAT_ZERO_TEST_AMI_ID }} jobs: build-and-copy: @@ -234,19 +233,12 @@ jobs: - name: Resolve baseline NAT AMI run: | - nat_ami_id="$(aws ec2 describe-images \ - --region us-east-1 \ - --owners "$TEST_NAT_AMI_OWNER_ACCOUNT" \ - --filters "Name=name,Values=$TEST_NAT_AMI_NAME_PATTERN" "Name=state,Values=available" \ - --query 'reverse(sort_by(Images,&CreationDate))[0].ImageId' \ - --output text)" - - if [ -z "$nat_ami_id" ] || [ "$nat_ami_id" = "None" ]; then - echo "failed to resolve the baseline integration NAT AMI" >&2 + if [ -z "$TEST_NAT_AMI_ID" ]; then + echo "baseline integration NAT AMI is not configured" >&2 exit 1 fi - echo "NAT_ZERO_TEST_NAT_AMI_ID=$nat_ami_id" >> "$GITHUB_ENV" + echo "NAT_ZERO_TEST_NAT_AMI_ID=$TEST_NAT_AMI_ID" >> "$GITHUB_ENV" echo "NAT_ZERO_TEST_UPDATED_NAT_AMI_ID=${{ needs.build-and-copy.outputs.test_ami_id }}" >> "$GITHUB_ENV" - name: Build Lambda binary diff --git a/docs/testing.md b/docs/testing.md index cacfdf3..a007590 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -57,7 +57,7 @@ Integration tests run in GitHub Actions when the `integration-test` label is add - Concurrency: one test at a time (`cancel-in-progress: false`) - Timeout: 15 minutes - Region: us-east-1 -- Default NAT AMI: resolves the shared test nat-zero AMI from account `590144423513` in us-east-1 unless `nat_ami_id` is supplied explicitly +- Default NAT AMI: shared private test nat-zero AMI supplied via the GitHub Actions variable `NAT_ZERO_TEST_AMI_ID` unless `nat_ami_id` is supplied explicitly ## Orphan Detection diff --git a/docs/workflows.md b/docs/workflows.md index f4b1c28..cf4fca9 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -42,7 +42,7 @@ Full end-to-end test: deploys real AWS infrastructure via Terratest, exercises t - **Timeout**: 15 minutes. - **Job name**: `integration-test` (required status check for merge). 
- **Optional inputs**: - - `nat_ami_id` to force the fixture onto a specific NAT AMI. If omitted, the workflow resolves the shared us-east-1 test AMI from account `590144423513`. + - `nat_ami_id` to force the fixture onto a specific NAT AMI. If omitted, the workflow uses the shared private test AMI from the GitHub Actions variable `NAT_ZERO_TEST_AMI_ID`. - `updated_nat_ami_id` to exercise the AMI replacement path after a second `terraform apply`. ### Steps @@ -60,7 +60,7 @@ Manual promotion workflow for the default public nat-zero AMI. 2. Copy it to every enabled AWS region in the account. 3. Run two us-east-1 integration gates: - direct test of the new AMI - - upgrade-path test that starts from the shared us-east-1 test NAT AMI, reapplies the module with the new AMI, and verifies the old NAT is replaced + - upgrade-path test that starts from the shared private test NAT AMI in `NAT_ZERO_TEST_AMI_ID`, reapplies the module with the new AMI, and verifies the old NAT is replaced 4. Make every copied AMI public. 5. Open a PR that updates the Terraform defaults (`ami_owner_account`, `ami_name_pattern`) so merge + release-please can publish the new module version. From 3be4bf8fa3c7f9c05285b22e1250d1e9d92aef3c Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Fri, 6 Mar 2026 17:24:23 +1000 Subject: [PATCH 06/27] fix: point defaults at the bootstrap nat-zero AMI --- README.md | 4 ++-- docs/reference.md | 4 ++-- variables.tf | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7c679c3..2a9bd5f 100644 --- a/README.md +++ b/README.md @@ -151,8 +151,8 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [ami\_id](#input\_ami\_id) | Explicit AMI ID to use (overrides AMI lookup entirely) | `string` | `null` | no | -| [ami\_name\_pattern](#input\_ami\_name\_pattern) | AMI name pattern used when resolving the default nat-zero AMI. 
Override this to use your own shared AMI. | `string` | `null` | no | -| [ami\_owner\_account](#input\_ami\_owner\_account) | Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI. | `string` | `null` | no | +| [ami\_name\_pattern](#input\_ami\_name\_pattern) | AMI name pattern used when resolving the default nat-zero AMI. Override this to use your own shared AMI. | `string` | `"nat-zero-al2023-minimal-arm64-20260306-064438"` | no | +| [ami\_owner\_account](#input\_ami\_owner\_account) | Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI. | `string` | `"590144423513"` | no | | [availability\_zones](#input\_availability\_zones) | List of availability zones to deploy NAT instances in | `list(string)` | n/a | yes | | [block\_device\_size](#input\_block\_device\_size) | Size in GB of the root EBS volume | `number` | `10` | no | | [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source instead of downloading a pre-compiled release. Requires Go and zip installed locally. | `bool` | `false` | no | diff --git a/docs/reference.md b/docs/reference.md index 6766016..6979b9d 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -43,8 +43,8 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [ami\_id](#input\_ami\_id) | Explicit AMI ID to use (overrides AMI lookup entirely) | `string` | `null` | no | -| [ami\_name\_pattern](#input\_ami\_name\_pattern) | AMI name pattern used when resolving the default nat-zero AMI. Override this to use your own shared AMI. | `string` | `null` | no | -| [ami\_owner\_account](#input\_ami\_owner\_account) | Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI. 
| `string` | `null` | no | +| [ami\_name\_pattern](#input\_ami\_name\_pattern) | AMI name pattern used when resolving the default nat-zero AMI. Override this to use your own shared AMI. | `string` | `"nat-zero-al2023-minimal-arm64-20260306-064438"` | no | +| [ami\_owner\_account](#input\_ami\_owner\_account) | Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI. | `string` | `"590144423513"` | no | | [availability\_zones](#input\_availability\_zones) | List of availability zones to deploy NAT instances in | `list(string)` | n/a | yes | | [block\_device\_size](#input\_block\_device\_size) | Size in GB of the root EBS volume | `number` | `10` | no | | [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source instead of downloading a pre-compiled release. Requires Go and zip installed locally. | `bool` | `false` | no | diff --git a/variables.tf b/variables.tf index 49e89ef..9513cee 100644 --- a/variables.tf +++ b/variables.tf @@ -76,13 +76,13 @@ variable "ami_id" { variable "ami_owner_account" { type = string - default = null + default = "590144423513" description = "Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI." } variable "ami_name_pattern" { type = string - default = null + default = "nat-zero-al2023-minimal-arm64-20260306-064438" description = "AMI name pattern used when resolving the default nat-zero AMI. Override this to use your own shared AMI." 
} From 8b5fd1ab7828f26ffccd4127d391d0ba004eda7b Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Sun, 8 Mar 2026 11:39:04 +1000 Subject: [PATCH 07/27] fix: use packer native regional ami copies --- .github/workflows/nat-images.yml | 108 ++++--------------- .pre-commit-config.yaml | 4 +- ami/README.md | 6 +- ami/nat-zero-private-all-regions.pkrvars.hcl | 38 +++++++ ami/nat-zero.pkr.hcl | 6 ++ docs/workflows.md | 4 +- scripts/publish_ami_public.sh | 97 +++++++++++++++++ scripts/render_packer_ami_regions.sh | 29 +++++ 8 files changed, 199 insertions(+), 93 deletions(-) create mode 100644 ami/nat-zero-private-all-regions.pkrvars.hcl create mode 100755 scripts/publish_ami_public.sh create mode 100755 scripts/render_packer_ami_regions.sh diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml index 3b5a64b..d68f478 100644 --- a/.github/workflows/nat-images.yml +++ b/.github/workflows/nat-images.yml @@ -29,6 +29,7 @@ permissions: env: TEST_NAT_AMI_ID: ${{ vars.NAT_ZERO_TEST_AMI_ID }} + PACKER_REGIONS_FILE: ami/nat-zero-private-all-regions.pkrvars.hcl jobs: build-and-copy: @@ -39,7 +40,6 @@ jobs: owner_account_id: ${{ steps.metadata.outputs.owner_account_id }} source_ami_id: ${{ steps.build.outputs.source_ami_id }} test_ami_id: ${{ steps.test-ami.outputs.test_ami_id }} - regions_json: ${{ steps.regions.outputs.regions_json }} steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 @@ -50,6 +50,16 @@ jobs: role-to-assume: ${{ secrets.AMI_BUILD_ROLE_ARN }} aws-region: ${{ inputs.source_region }} + - name: Prepare Packer copy regions + id: regions + run: | + bash scripts/render_packer_ami_regions.sh \ + "$PACKER_REGIONS_FILE" \ + "${{ inputs.source_region }}" \ + "$RUNNER_TEMP/nat-zero-copy-regions.pkrvars.hcl" + + echo "var_file=$RUNNER_TEMP/nat-zero-copy-regions.pkrvars.hcl" >> "$GITHUB_OUTPUT" + - name: Build AMI id: build working-directory: ami @@ -58,13 +68,14 @@ jobs: packer init nat-zero.pkr.hcl packer build \ 
-color=false \ + -var-file "${{ steps.regions.outputs.var_file }}" \ -var "region=${{ inputs.source_region }}" \ -var "subnet_id=${{ inputs.build_subnet_id }}" \ nat-zero.pkr.hcl - source_ami_id="$(jq -r '.builds[-1].artifact_id' manifest.json | awk -F: '{print $2}')" + source_ami_id="$(jq -er '.builds[-1].artifact_id' manifest.json | awk -F: 'NF == 2 && $2 != "" { print $2 }')" if [ -z "$source_ami_id" ] || [ "$source_ami_id" = "null" ]; then - echo "failed to determine source AMI ID from ami/manifest.json" >&2 + echo "failed to determine source AMI ID after packer build" >&2 exit 1 fi @@ -81,71 +92,6 @@ jobs: echo "owner_account_id=$owner_account_id" >> "$GITHUB_OUTPUT" echo "ami_name=$ami_name" >> "$GITHUB_OUTPUT" - - name: Resolve enabled regions - id: regions - run: | - regions_json="$(aws ec2 describe-regions --all-regions --query "Regions[?OptInStatus=='opt-in-not-required' || OptInStatus=='opted-in'].RegionName" --output json | jq -c 'sort')" - echo "regions_json=$regions_json" >> "$GITHUB_OUTPUT" - - - name: Copy AMI to enabled regions - env: - AMI_NAME: ${{ steps.metadata.outputs.ami_name }} - REGIONS_JSON: ${{ steps.regions.outputs.regions_json }} - SOURCE_AMI_ID: ${{ steps.build.outputs.source_ami_id }} - SOURCE_REGION: ${{ inputs.source_region }} - run: | - copy_region() { - local region="$1" - local attempt - local err_file - local image_id - - err_file="$(mktemp)" - trap 'rm -f "$err_file"' RETURN - - for attempt in 1 2 3 4 5; do - image_id="$(aws ec2 copy-image \ - --region "$region" \ - --source-region "$SOURCE_REGION" \ - --source-image-id "$SOURCE_AMI_ID" \ - --name "$AMI_NAME" \ - --description "nat-zero AMI copied from $SOURCE_REGION" \ - --query 'ImageId' \ - --output text 2>"$err_file")" && break - sleep $((attempt * 5)) - done - - if [ -z "${image_id:-}" ] || [ "$image_id" = "None" ]; then - cat "$err_file" >&2 || true - return 1 - fi - - aws ec2 wait image-available --region "$region" --image-ids "$image_id" - } - - pids=() - - while IFS= 
read -r region; do - if [ "$region" = "$SOURCE_REGION" ]; then - continue - fi - - ( - set -euo pipefail - copy_region "$region" - ) & - pids+=("$!") - done < <(jq -r '.[]' <<<"$REGIONS_JSON") - - status=0 - for pid in "${pids[@]}"; do - if ! wait "$pid"; then - status=1 - fi - done - - exit "$status" - - name: Resolve us-east-1 test AMI id: test-ami env: @@ -268,6 +214,8 @@ jobs: runs-on: ubuntu-latest environment: ami-build steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 with: role-to-assume: ${{ secrets.AMI_BUILD_ROLE_ARN }} @@ -277,26 +225,12 @@ jobs: env: AMI_NAME: ${{ needs.build-and-copy.outputs.ami_name }} OWNER_ACCOUNT_ID: ${{ needs.build-and-copy.outputs.owner_account_id }} - REGIONS_JSON: ${{ needs.build-and-copy.outputs.regions_json }} run: | - while IFS= read -r region; do - image_id="$(aws ec2 describe-images \ - --region "$region" \ - --owners "$OWNER_ACCOUNT_ID" \ - --filters "Name=name,Values=$AMI_NAME" "Name=state,Values=available" \ - --query 'Images[0].ImageId' \ - --output text)" - - if [ -z "$image_id" ] || [ "$image_id" = "None" ]; then - echo "failed to resolve image for $region" >&2 - exit 1 - fi - - aws ec2 modify-image-attribute \ - --region "$region" \ - --image-id "$image_id" \ - --launch-permission 'Add=[{Group=all}]' - done < <(jq -r '.[]' <<<"$REGIONS_JSON") + bash scripts/publish_ami_public.sh \ + "$OWNER_ACCOUNT_ID" \ + "$AMI_NAME" \ + "${{ inputs.source_region }}" \ + "$PACKER_REGIONS_FILE" open-promotion-pr: needs: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5826891..42acec9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -42,13 +42,13 @@ repos: - id: packer-fmt name: packer fmt language: system - entry: bash -c 'cd ami && packer fmt -check -diff nat-zero.pkr.hcl' + entry: bash -c 'cd ami && packer fmt -check -diff .' 
files: '^ami/.*\.(pkr\.hcl|hcl)$' pass_filenames: false - id: packer-validate name: packer validate language: system - entry: bash -c 'cd ami && packer init nat-zero.pkr.hcl >/dev/null && packer validate -var "subnet_id=subnet-00000000000000000" nat-zero.pkr.hcl' + entry: bash -c 'tmp="$(mktemp).pkrvars.hcl" && trap '\''rm -f "$tmp"'\'' EXIT && bash scripts/render_packer_ami_regions.sh ami/nat-zero-private-all-regions.pkrvars.hcl us-east-1 "$tmp" && cd ami && packer init nat-zero.pkr.hcl >/dev/null && packer validate -var-file "$tmp" -var "subnet_id=subnet-00000000000000000" nat-zero.pkr.hcl' files: '^ami/.*$' pass_filenames: false - repo: https://github.com/zricethezav/gitleaks diff --git a/ami/README.md b/ami/README.md index 1f55eea..bfce233 100644 --- a/ami/README.md +++ b/ami/README.md @@ -27,6 +27,7 @@ This directory contains the Packer build for the nat-zero AMI. cd ami packer init nat-zero.pkr.hcl packer build \ + -var-file "nat-zero-private-all-regions.pkrvars.hcl" \ -var "region=us-east-1" \ -var "subnet_id=subnet-0123456789abcdef0" \ nat-zero.pkr.hcl @@ -51,8 +52,9 @@ Workflow: `.github/workflows/nat-images.yml` - `run_integration_gate` (default `true`) - Behavior: - builds a new nat-zero AMI with Packer - - copies it to all currently enabled regions in the account (parallel copy with retries) - - runs integration tests against the new source AMI (gate) before promotion + - uses `nat-zero-private-all-regions.pkrvars.hcl` as the checked-in list of private regional copies + - runs integration tests against the new us-east-1 AMI before any public sharing + - publishes the copied AMIs only after the integration gates pass - updates `ami_owner_account`, `ami_name_pattern` (and generated docs) and opens a PR Merge the promotion PR to `main` to let release-please publish a new module release that points to the promoted AMI name. 
diff --git a/ami/nat-zero-private-all-regions.pkrvars.hcl b/ami/nat-zero-private-all-regions.pkrvars.hcl new file mode 100644 index 0000000..502f312 --- /dev/null +++ b/ami/nat-zero-private-all-regions.pkrvars.hcl @@ -0,0 +1,38 @@ +# Keep this list aligned with the regions we want nat-zero published into. +# me-central-1 is intentionally excluded for now because EC2 CopyImage is +# currently throttling there for this account. +ami_regions = [ + "af-south-1", + "ap-east-1", + "ap-east-2", + "ap-northeast-1", + "ap-northeast-2", + "ap-northeast-3", + "ap-south-1", + "ap-south-2", + "ap-southeast-1", + "ap-southeast-2", + "ap-southeast-3", + "ap-southeast-4", + "ap-southeast-5", + "ap-southeast-6", + "ap-southeast-7", + "ca-central-1", + "ca-west-1", + "eu-central-1", + "eu-central-2", + "eu-north-1", + "eu-south-1", + "eu-south-2", + "eu-west-1", + "eu-west-2", + "eu-west-3", + "il-central-1", + "me-south-1", + "mx-central-1", + "sa-east-1", + "us-east-1", + "us-east-2", + "us-west-1", + "us-west-2", +] diff --git a/ami/nat-zero.pkr.hcl b/ami/nat-zero.pkr.hcl index 4a3dc66..579335f 100644 --- a/ami/nat-zero.pkr.hcl +++ b/ami/nat-zero.pkr.hcl @@ -26,8 +26,14 @@ variable "root_volume_size" { default = 4 } +variable "ami_regions" { + type = list(string) + default = [] +} + source "amazon-ebs" "nat_zero" { ami_name = "${var.ami_name_prefix}-${formatdate("YYYYMMDD-hhmmss", timestamp())}" + ami_regions = var.ami_regions instance_type = "t4g.nano" region = var.region max_retries = 50 diff --git a/docs/workflows.md b/docs/workflows.md index cf4fca9..91b4188 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -57,11 +57,11 @@ Full end-to-end test: deploys real AWS infrastructure via Terratest, exercises t Manual promotion workflow for the default public nat-zero AMI. 1. Build the AMI with Packer in the chosen source region. -2. Copy it to every enabled AWS region in the account. +2. 
Let Packer privately copy it to the regions listed in `ami/nat-zero-private-all-regions.pkrvars.hcl`. 3. Run two us-east-1 integration gates: - direct test of the new AMI - upgrade-path test that starts from the shared private test NAT AMI in `NAT_ZERO_TEST_AMI_ID`, reapplies the module with the new AMI, and verifies the old NAT is replaced -4. Make every copied AMI public. +4. After the integration gates pass, run a small publish script that opens launch permissions for the copied AMIs. 5. Open a PR that updates the Terraform defaults (`ami_owner_account`, `ami_name_pattern`) so merge + release-please can publish the new module version. ## Docs (`docs.yml`) diff --git a/scripts/publish_ami_public.sh b/scripts/publish_ami_public.sh new file mode 100755 index 0000000..f18d838 --- /dev/null +++ b/scripts/publish_ami_public.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ "$#" -ne 4 ]; then + echo "usage: $0 " >&2 + exit 1 +fi + +owner_account_id="$1" +ami_name="$2" +source_region="$3" +regions_file="$4" + +mapfile -t publish_regions < <(awk -F'"' '/"/ {print $2}' "$regions_file") + +if [ "${#publish_regions[@]}" -eq 0 ]; then + echo "no regions found in $regions_file" >&2 + exit 1 +fi + +source_present=0 +for region in "${publish_regions[@]}"; do + if [ "$region" = "$source_region" ]; then + source_present=1 + break + fi +done +if [ "$source_present" -eq 0 ]; then + publish_regions+=("$source_region") +fi + +cleanup() { + local region + + for region in "${publish_regions[@]}"; do + aws ec2 enable-image-block-public-access \ + --region "$region" \ + --image-block-public-access-state block-new-sharing >/dev/null + done + + for region in "${publish_regions[@]}"; do + for _ in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do + state="$( + aws ec2 get-image-block-public-access-state \ + --region "$region" \ + --query 'ImageBlockPublicAccessState' \ + --output text + )" + if [ "$state" = "block-new-sharing" ]; then + break + fi + sleep 20 + done + done +} + +trap 
cleanup EXIT + +for region in "${publish_regions[@]}"; do + aws ec2 disable-image-block-public-access --region "$region" >/dev/null +done + +for region in "${publish_regions[@]}"; do + for _ in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do + state="$( + aws ec2 get-image-block-public-access-state \ + --region "$region" \ + --query 'ImageBlockPublicAccessState' \ + --output text + )" + if [ "$state" = "unblocked" ]; then + break + fi + sleep 20 + done +done + +for region in "${publish_regions[@]}"; do + image_id="$( + aws ec2 describe-images \ + --region "$region" \ + --owners "$owner_account_id" \ + --filters "Name=name,Values=$ami_name" "Name=state,Values=available" \ + --query 'Images[0].ImageId' \ + --output text + )" + + if [ -z "$image_id" ] || [ "$image_id" = "None" ]; then + echo "failed to resolve image for $region" >&2 + exit 1 + fi + + aws ec2 modify-image-attribute \ + --region "$region" \ + --image-id "$image_id" \ + --launch-permission 'Add=[{Group=all}]' +done diff --git a/scripts/render_packer_ami_regions.sh b/scripts/render_packer_ami_regions.sh new file mode 100755 index 0000000..3b02ecf --- /dev/null +++ b/scripts/render_packer_ami_regions.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ "$#" -ne 3 ]; then + echo "usage: $0 " >&2 + exit 1 +fi + +regions_file="$1" +source_region="$2" +output_file="$3" + +mapfile -t configured_regions < <(awk -F'"' '/"/ {print $2}' "$regions_file") + +if [ "${#configured_regions[@]}" -eq 0 ]; then + echo "no regions found in $regions_file" >&2 + exit 1 +fi + +{ + echo "ami_regions = [" + for region in "${configured_regions[@]}"; do + if [ "$region" = "$source_region" ]; then + continue + fi + printf ' "%s",\n' "$region" + done + echo "]" +} >"$output_file" From 9c68b021396410db2177e483d64f3ad9285076c1 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Sun, 8 Mar 2026 16:28:35 +1000 Subject: [PATCH 08/27] fix: support nat image validation on pull requests --- .github/workflows/nat-images.yml | 93 
++++++++++++++++++++++++++------ ami/README.md | 7 ++- docs/workflows.md | 4 +- 3 files changed, 87 insertions(+), 17 deletions(-) diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml index d68f478..f7ba7d0 100644 --- a/.github/workflows/nat-images.yml +++ b/.github/workflows/nat-images.yml @@ -5,7 +5,7 @@ on: inputs: build_subnet_id: description: Public subnet ID to use for the Packer builder instance - required: true + required: false type: string source_region: description: Region where the AMI is built before being copied globally @@ -17,6 +17,9 @@ on: required: false default: true type: boolean + pull_request: + types: + - labeled concurrency: group: nat-zero-ami @@ -32,7 +35,60 @@ env: PACKER_REGIONS_FILE: ami/nat-zero-private-all-regions.pkrvars.hcl jobs: + resolve-inputs: + runs-on: ubuntu-latest + outputs: + build_subnet_id: ${{ steps.resolve.outputs.build_subnet_id }} + source_region: ${{ steps.resolve.outputs.source_region }} + run_integration_gate: ${{ steps.resolve.outputs.run_integration_gate }} + should_run: ${{ steps.resolve.outputs.should_run }} + should_publish: ${{ steps.resolve.outputs.should_publish }} + checkout_ref: ${{ steps.resolve.outputs.checkout_ref }} + steps: + - name: Resolve workflow inputs + id: resolve + env: + EVENT_NAME: ${{ github.event_name }} + EVENT_LABEL: ${{ github.event.label.name }} + INPUT_BUILD_SUBNET_ID: ${{ github.event.inputs.build_subnet_id }} + INPUT_SOURCE_REGION: ${{ github.event.inputs.source_region }} + INPUT_RUN_INTEGRATION_GATE: ${{ github.event.inputs.run_integration_gate }} + DEFAULT_BUILD_SUBNET_ID: ${{ vars.NAT_ZERO_AMI_BUILD_SUBNET_ID }} + GITHUB_HEAD_REF: ${{ github.head_ref }} + GITHUB_REF_NAME: ${{ github.ref_name }} + run: | + should_run=true + should_publish=true + + if [ "$EVENT_NAME" = "pull_request" ]; then + if [ "$EVENT_LABEL" != "nat-images" ]; then + should_run=false + fi + should_publish=false + fi + + 
build_subnet_id="${INPUT_BUILD_SUBNET_ID:-$DEFAULT_BUILD_SUBNET_ID}" + source_region="${INPUT_SOURCE_REGION:-us-east-1}" + run_integration_gate="${INPUT_RUN_INTEGRATION_GATE:-true}" + checkout_ref="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}" + + if [ "$should_run" = "true" ] && [ -z "$build_subnet_id" ]; then + echo "build_subnet_id input is required unless vars.NAT_ZERO_AMI_BUILD_SUBNET_ID is set" >&2 + exit 1 + fi + + { + echo "build_subnet_id=$build_subnet_id" + echo "source_region=$source_region" + echo "run_integration_gate=$run_integration_gate" + echo "should_run=$should_run" + echo "should_publish=$should_publish" + echo "checkout_ref=$checkout_ref" + } >> "$GITHUB_OUTPUT" + build-and-copy: + needs: resolve-inputs + if: ${{ needs.resolve-inputs.outputs.should_run == 'true' }} runs-on: ubuntu-latest environment: ami-build outputs: @@ -48,14 +104,14 @@ jobs: - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 with: role-to-assume: ${{ secrets.AMI_BUILD_ROLE_ARN }} - aws-region: ${{ inputs.source_region }} + aws-region: ${{ needs.resolve-inputs.outputs.source_region }} - name: Prepare Packer copy regions id: regions run: | bash scripts/render_packer_ami_regions.sh \ "$PACKER_REGIONS_FILE" \ - "${{ inputs.source_region }}" \ + "${{ needs.resolve-inputs.outputs.source_region }}" \ "$RUNNER_TEMP/nat-zero-copy-regions.pkrvars.hcl" echo "var_file=$RUNNER_TEMP/nat-zero-copy-regions.pkrvars.hcl" >> "$GITHUB_OUTPUT" @@ -69,8 +125,8 @@ jobs: packer build \ -color=false \ -var-file "${{ steps.regions.outputs.var_file }}" \ - -var "region=${{ inputs.source_region }}" \ - -var "subnet_id=${{ inputs.build_subnet_id }}" \ + -var "region=${{ needs.resolve-inputs.outputs.source_region }}" \ + -var "subnet_id=${{ needs.resolve-inputs.outputs.build_subnet_id }}" \ nat-zero.pkr.hcl source_ami_id="$(jq -er '.builds[-1].artifact_id' manifest.json | awk -F: 'NF == 2 && $2 != "" { print $2 }')" @@ -87,7 +143,7 @@ jobs: SOURCE_AMI_ID: ${{ 
steps.build.outputs.source_ami_id }} run: | owner_account_id="$(aws sts get-caller-identity --query 'Account' --output text)" - ami_name="$(aws ec2 describe-images --region "${{ inputs.source_region }}" --image-ids "$SOURCE_AMI_ID" --query 'Images[0].Name' --output text)" + ami_name="$(aws ec2 describe-images --region "${{ needs.resolve-inputs.outputs.source_region }}" --image-ids "$SOURCE_AMI_ID" --query 'Images[0].Name' --output text)" echo "owner_account_id=$owner_account_id" >> "$GITHUB_OUTPUT" echo "ami_name=$ami_name" >> "$GITHUB_OUTPUT" @@ -99,7 +155,7 @@ jobs: OWNER_ACCOUNT_ID: ${{ steps.metadata.outputs.owner_account_id }} SOURCE_AMI_ID: ${{ steps.build.outputs.source_ami_id }} run: | - if [ "${{ inputs.source_region }}" = "us-east-1" ]; then + if [ "${{ needs.resolve-inputs.outputs.source_region }}" = "us-east-1" ]; then test_ami_id="$SOURCE_AMI_ID" else test_ami_id="$(aws ec2 describe-images \ @@ -118,8 +174,10 @@ jobs: echo "test_ami_id=$test_ami_id" >> "$GITHUB_OUTPUT" integration-new-ami: - if: ${{ inputs.run_integration_gate }} - needs: build-and-copy + if: ${{ needs.resolve-inputs.outputs.should_run == 'true' && needs.resolve-inputs.outputs.run_integration_gate == 'true' }} + needs: + - resolve-inputs + - build-and-copy runs-on: ubuntu-latest timeout-minutes: 15 environment: integration @@ -154,8 +212,9 @@ jobs: run: go test -v -timeout 10m -count=1 integration-ami-upgrade: - if: ${{ inputs.run_integration_gate }} + if: ${{ needs.resolve-inputs.outputs.should_run == 'true' && needs.resolve-inputs.outputs.run_integration_gate == 'true' }} needs: + - resolve-inputs - build-and-copy - integration-new-ami runs-on: ubuntu-latest @@ -201,14 +260,17 @@ jobs: publish-public: needs: + - resolve-inputs - build-and-copy - integration-new-ami - integration-ami-upgrade if: >- always() && + needs.resolve-inputs.outputs.should_run == 'true' && + needs.resolve-inputs.outputs.should_publish == 'true' && needs.build-and-copy.result == 'success' && ( - 
inputs.run_integration_gate == false || + needs.resolve-inputs.outputs.run_integration_gate != 'true' || (needs.integration-new-ami.result == 'success' && needs.integration-ami-upgrade.result == 'success') ) runs-on: ubuntu-latest @@ -219,7 +281,7 @@ jobs: - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 with: role-to-assume: ${{ secrets.AMI_BUILD_ROLE_ARN }} - aws-region: ${{ inputs.source_region }} + aws-region: ${{ needs.resolve-inputs.outputs.source_region }} - name: Make copied AMIs public env: @@ -229,19 +291,20 @@ jobs: bash scripts/publish_ami_public.sh \ "$OWNER_ACCOUNT_ID" \ "$AMI_NAME" \ - "${{ inputs.source_region }}" \ + "${{ needs.resolve-inputs.outputs.source_region }}" \ "$PACKER_REGIONS_FILE" open-promotion-pr: needs: + - resolve-inputs - build-and-copy - publish-public - if: ${{ needs.publish-public.result == 'success' }} + if: ${{ needs.resolve-inputs.outputs.should_publish == 'true' && needs.publish-public.result == 'success' }} runs-on: ubuntu-latest steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: - ref: ${{ github.ref_name }} + ref: ${{ needs.resolve-inputs.outputs.checkout_ref }} - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 with: diff --git a/ami/README.md b/ami/README.md index bfce233..d79bb21 100644 --- a/ami/README.md +++ b/ami/README.md @@ -45,8 +45,12 @@ Workflow: `.github/workflows/nat-images.yml` - Requires GitHub environment secret `AMI_BUILD_ROLE_ARN` - Requires GitHub environment secret `INTEGRATION_ROLE_ARN` when `run_integration_gate=true` +- Requires GitHub Actions variable `NAT_ZERO_AMI_BUILD_SUBNET_ID` for label-triggered PR validation runs - Uses OIDC via `aws-actions/configure-aws-credentials` -- Inputs: +- Triggers: + - `workflow_dispatch` + - PR label `nat-images` for pre-merge validation on the branch under review +- Inputs for `workflow_dispatch`: - `build_subnet_id` - `source_region` (default `us-east-1`) - 
`run_integration_gate` (default `true`) @@ -55,6 +59,7 @@ Workflow: `.github/workflows/nat-images.yml` - uses `nat-zero-private-all-regions.pkrvars.hcl` as the checked-in list of private regional copies - runs integration tests against the new us-east-1 AMI before any public sharing - publishes the copied AMIs only after the integration gates pass +- PR label runs stop after build + integration so they can validate the branch safely without publishing or opening a promotion PR - updates `ami_owner_account`, `ami_name_pattern` (and generated docs) and opens a PR Merge the promotion PR to `main` to let release-please publish a new module release that points to the promoted AMI name. diff --git a/docs/workflows.md b/docs/workflows.md index 91b4188..5f3cdae 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -9,7 +9,7 @@ Internal reference for GitHub Actions workflows, repo rulesets, and the release | Pre-commit | `precommit.yml` | All PRs | `precommit` | | Go Tests | `go-tests.yml` | PRs touching `cmd/lambda/**`; push to `main` | `go-test` | | Integration Tests | `integration-tests.yml` | PR labeled `integration-test`; manual dispatch; reusable workflow | `integration-test` | -| NAT Images | `nat-images.yml` | Manual dispatch | No (promotion workflow) | +| NAT Images | `nat-images.yml` | Manual dispatch; PR labeled `nat-images` | No (promotion workflow) | | Docs | `docs.yml` | Push to `main` (filtered paths) | No (post-merge deploy) | | Release | `release-please.yml` | Push to `main`; manual dispatch | No (post-merge) | @@ -64,6 +64,8 @@ Manual promotion workflow for the default public nat-zero AMI. 4. After the integration gates pass, run a small publish script that opens launch permissions for the copied AMIs. 5. Open a PR that updates the Terraform defaults (`ami_owner_account`, `ami_name_pattern`) so merge + release-please can publish the new module version. +For pre-merge validation on a branch, add the `nat-images` label to the PR. 
That trigger uses the GitHub Actions variable `NAT_ZERO_AMI_BUILD_SUBNET_ID`, runs the build and integration gates on the PR branch, and intentionally skips the public-sharing and promotion-PR jobs. + ## Docs (`docs.yml`) Deploys MkDocs Material to GitHub Pages. From 31c2824f0745ae23e0cef9b7f5d011508bd07031 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Sun, 8 Mar 2026 16:46:26 +1000 Subject: [PATCH 09/27] fix: parse packer multi-region manifest output --- .github/workflows/nat-images.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml index f7ba7d0..e4646e7 100644 --- a/.github/workflows/nat-images.yml +++ b/.github/workflows/nat-images.yml @@ -119,6 +119,8 @@ jobs: - name: Build AMI id: build working-directory: ami + env: + SOURCE_REGION: ${{ needs.resolve-inputs.outputs.source_region }} run: | rm -f manifest.json packer init nat-zero.pkr.hcl @@ -129,7 +131,11 @@ jobs: -var "subnet_id=${{ needs.resolve-inputs.outputs.build_subnet_id }}" \ nat-zero.pkr.hcl - source_ami_id="$(jq -er '.builds[-1].artifact_id' manifest.json | awk -F: 'NF == 2 && $2 != "" { print $2 }')" + source_ami_id="$( + jq -er '.builds[-1].artifact_id' manifest.json | + tr ',' '\n' | + awk -F: -v source_region="$SOURCE_REGION" '$1 == source_region && $2 != "" { print $2; exit }' + )" if [ -z "$source_ami_id" ] || [ "$source_ami_id" = "null" ]; then echo "failed to determine source AMI ID after packer build" >&2 exit 1 From 8000c57fa0065847bdad88400d58239bb8bd729e Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Sun, 8 Mar 2026 17:24:53 +1000 Subject: [PATCH 10/27] fix: stabilize ami validation workflow --- cmd/lambda/ec2ops.go | 17 +++++++++++-- lambda.tf | 2 +- tests/integration/nat_zero_test.go | 41 ++++++++++++++++++++++-------- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/cmd/lambda/ec2ops.go b/cmd/lambda/ec2ops.go index dadff5d..4fade97 100644 --- 
a/cmd/lambda/ec2ops.go +++ b/cmd/lambda/ec2ops.go @@ -445,8 +445,21 @@ func (h *Handler) cleanupAll(ctx context.Context) { // before termination completes, Terraform may try to delete still-attached ENIs. func (h *Handler) waitForTermination(ctx context.Context, instanceIDs []string) { defer timed("wait_for_termination")() - for attempt := 0; attempt < 60; attempt++ { - time.Sleep(2 * time.Second) + const ( + pollInterval = 2 * time.Second + maxAttempts = 90 + deadlineBuffer = 5 * time.Second + ) + + deadline, hasDeadline := ctx.Deadline() + for attempt := 0; attempt < maxAttempts; attempt++ { + if hasDeadline && time.Until(deadline) <= deadlineBuffer { + log.Printf("Stopping termination wait with Lambda deadline %s away", time.Until(deadline).Round(time.Second)) + return + } + if attempt > 0 { + time.Sleep(pollInterval) + } resp, err := h.EC2.DescribeInstances(ctx, &ec2.DescribeInstancesInput{ InstanceIds: instanceIDs, Filters: []ec2types.Filter{ diff --git a/lambda.tf b/lambda.tf index 29f5733..2d692c4 100644 --- a/lambda.tf +++ b/lambda.tf @@ -59,7 +59,7 @@ resource "aws_lambda_function" "nat_zero" { runtime = "provided.al2023" source_code_hash = fileexists("${path.module}/.build/lambda.zip") ? 
filebase64sha256("${path.module}/.build/lambda.zip") : null architectures = ["arm64"] - timeout = 90 + timeout = 180 reserved_concurrent_executions = 1 memory_size = var.lambda_memory_size tags = local.common_tags diff --git a/tests/integration/nat_zero_test.go b/tests/integration/nat_zero_test.go index 68c2af4..df4cd33 100644 --- a/tests/integration/nat_zero_test.go +++ b/tests/integration/nat_zero_test.go @@ -10,6 +10,7 @@ import ( "time" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/cloudwatchevents" "github.com/aws/aws-sdk-go/service/cloudwatchlogs" @@ -378,13 +379,9 @@ func TestNatZero(t *testing.T) { waitForInstanceTerminated(t, ec2Client, oldNatID) record("Wait for outdated NAT terminated", time.Since(waitTermStart)) - invokeCreateStart := time.Now() - invokeLambda(t, lambdaClient, lambdaName, map[string]string{ - "instance_id": activeWorkloadID, - "state": "running", - }) - record("Lambda invoke (AMI update create)", time.Since(invokeCreateStart)) - + // The old NAT termination emits the next EventBridge signal. That + // should drive creation of the replacement NAT without another manual + // invoke, which would race the single-concurrency reconciler. 
replacementStart := time.Now() replacementNat := waitForRunningNATWithEIP(t, ec2Client, vpcID, "replacement NAT running with EIP") record("Wait for replacement NAT running with EIP", time.Since(replacementStart)) @@ -503,10 +500,21 @@ func TestNatZero(t *testing.T) { func invokeLambda(t *testing.T, client *lambda.Lambda, funcName string, payload map[string]string) { t.Helper() body, _ := json.Marshal(payload) - out, err := client.Invoke(&lambda.InvokeInput{ - FunctionName: aws.String(funcName), - Payload: body, - LogType: aws.String("Tail"), + var out *lambda.InvokeOutput + _, err := retry.DoWithRetryE(t, "lambda invoke", 20, 3*time.Second, func() (string, error) { + var invokeErr error + out, invokeErr = client.Invoke(&lambda.InvokeInput{ + FunctionName: aws.String(funcName), + Payload: body, + LogType: aws.String("Tail"), + }) + if invokeErr == nil { + return "OK", nil + } + if isLambdaConcurrencyThrottle(invokeErr) { + return "", invokeErr + } + return "", retry.FatalError{Underlying: invokeErr} }) require.NoError(t, err, "Lambda invocation failed") if out.FunctionError != nil { @@ -527,6 +535,17 @@ func invokeLambda(t *testing.T, client *lambda.Lambda, funcName string, payload t.Logf("Lambda invoked: %v", payload) } +func isLambdaConcurrencyThrottle(err error) bool { + awsErr, ok := err.(awserr.Error) + if !ok { + return false + } + if awsErr.Code() != "TooManyRequestsException" { + return false + } + return strings.Contains(awsErr.Message(), "ReservedFunctionConcurrentInvocationLimitExceeded") +} + // dumpLambdaLogs prints recent Lambda CloudWatch log events for post-mortem debugging. 
func dumpLambdaLogs(t *testing.T, client *cloudwatchlogs.CloudWatchLogs, logGroup string) { t.Helper() From b09d01d40fb02c18f110f7c76a16533e9311b818 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Sun, 8 Mar 2026 18:47:13 +1000 Subject: [PATCH 11/27] fix: collapse nat image validation into one stack --- .github/workflows/nat-images.yml | 92 +++--------------------------- docs/testing.md | 13 ++++- docs/workflows.md | 8 ++- lambda.tf | 2 +- tests/integration/fixture/main.tf | 7 ++- tests/integration/nat_zero_test.go | 30 +++++++--- 6 files changed, 53 insertions(+), 99 deletions(-) diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml index e4646e7..0bfe43c 100644 --- a/.github/workflows/nat-images.yml +++ b/.github/workflows/nat-images.yml @@ -31,7 +31,6 @@ permissions: pull-requests: write env: - TEST_NAT_AMI_ID: ${{ vars.NAT_ZERO_TEST_AMI_ID }} PACKER_REGIONS_FILE: ami/nat-zero-private-all-regions.pkrvars.hcl jobs: @@ -179,97 +178,22 @@ jobs: echo "test_ami_id=$test_ami_id" >> "$GITHUB_OUTPUT" - integration-new-ami: + integration: if: ${{ needs.resolve-inputs.outputs.should_run == 'true' && needs.resolve-inputs.outputs.run_integration_gate == 'true' }} needs: - resolve-inputs - build-and-copy - runs-on: ubuntu-latest - timeout-minutes: 15 - environment: integration - env: - NAT_ZERO_TEST_NAT_AMI_ID: ${{ needs.build-and-copy.outputs.test_ami_id }} - steps: - - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 - with: - go-version-file: tests/integration/go.mod - - - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 - with: - terraform_wrapper: false - - - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 - with: - role-to-assume: ${{ secrets.INTEGRATION_ROLE_ARN }} - aws-region: us-east-1 - - - name: Build Lambda binary - working-directory: cmd/lambda - run: | - 
GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -tags lambda.norpc -ldflags='-s -w' -o bootstrap - zip lambda.zip bootstrap - mkdir -p ../../.build - cp lambda.zip ../../.build/lambda.zip - - - name: Test new AMI directly - working-directory: tests/integration - run: go test -v -timeout 10m -count=1 - - integration-ami-upgrade: - if: ${{ needs.resolve-inputs.outputs.should_run == 'true' && needs.resolve-inputs.outputs.run_integration_gate == 'true' }} - needs: - - resolve-inputs - - build-and-copy - - integration-new-ami - runs-on: ubuntu-latest - timeout-minutes: 15 - environment: integration - steps: - - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 - with: - go-version-file: tests/integration/go.mod - - - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 - with: - terraform_wrapper: false - - - uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4 - with: - role-to-assume: ${{ secrets.INTEGRATION_ROLE_ARN }} - aws-region: us-east-1 - - - name: Resolve baseline NAT AMI - run: | - if [ -z "$TEST_NAT_AMI_ID" ]; then - echo "baseline integration NAT AMI is not configured" >&2 - exit 1 - fi - - echo "NAT_ZERO_TEST_NAT_AMI_ID=$TEST_NAT_AMI_ID" >> "$GITHUB_ENV" - echo "NAT_ZERO_TEST_UPDATED_NAT_AMI_ID=${{ needs.build-and-copy.outputs.test_ami_id }}" >> "$GITHUB_ENV" - - - name: Build Lambda binary - working-directory: cmd/lambda - run: | - GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -tags lambda.norpc -ldflags='-s -w' -o bootstrap - zip lambda.zip bootstrap - mkdir -p ../../.build - cp lambda.zip ../../.build/lambda.zip - - - name: Test AMI upgrade path - working-directory: tests/integration - run: go test -v -timeout 15m -count=1 + uses: ./.github/workflows/integration-tests.yml + secrets: inherit + with: + nat_ami_id: ${{ vars.NAT_ZERO_TEST_AMI_ID }} + updated_nat_ami_id: ${{ 
needs.build-and-copy.outputs.test_ami_id }} publish-public: needs: - resolve-inputs - build-and-copy - - integration-new-ami - - integration-ami-upgrade + - integration if: >- always() && needs.resolve-inputs.outputs.should_run == 'true' && @@ -277,7 +201,7 @@ jobs: needs.build-and-copy.result == 'success' && ( needs.resolve-inputs.outputs.run_integration_gate != 'true' || - (needs.integration-new-ami.result == 'success' && needs.integration-ami-upgrade.result == 'success') + needs.integration.result == 'success' ) runs-on: ubuntu-latest environment: ami-build diff --git a/docs/testing.md b/docs/testing.md index a007590..30d179c 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -16,7 +16,7 @@ Integration tests require AWS credentials with permissions to manage EC2, IAM, L ## Integration Test Lifecycle -The test uses [Terratest](https://terratest.gruntwork.io/) with a single `terraform apply` / `destroy` cycle and four phases: +The test uses [Terratest](https://terratest.gruntwork.io/) with a single `terraform apply` / `destroy` cycle and five phases. Each run uses a unique `nat-test-*` module name so EventBridge, Lambda, and IAM resources do not collide across reruns. ### Phase 1: NAT Creation and Connectivity @@ -41,14 +41,21 @@ The test uses [Terratest](https://terratest.gruntwork.io/) with a single `terraf 3. Wait for NAT running with new EIP 4. Verify connectivity -### Phase 4: Cleanup Action +### Phase 4: AMI Replacement + +1. Reapply the fixture with `NAT_ZERO_TEST_UPDATED_NAT_AMI_ID` +2. Trigger reconciliation while a workload is active +3. Verify the old NAT instance is terminated +4. Verify the replacement NAT comes up on the new AMI and handles egress correctly + +### Phase 5: Cleanup Action 1. Invoke Lambda with `{action: "cleanup"}` 2. Verify all NAT instances terminated and EIPs released ### Teardown -`terraform destroy` removes all Terraform-managed resources. 
The cleanup action (Phase 4) ensures Lambda-created NAT instances are terminated first, so ENI deletion succeeds. +`terraform destroy` removes all Terraform-managed resources. The cleanup action (Phase 5) ensures Lambda-created NAT instances are terminated first, so ENI deletion succeeds. ## CI diff --git a/docs/workflows.md b/docs/workflows.md index 5f3cdae..7509c1d 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -58,9 +58,11 @@ Manual promotion workflow for the default public nat-zero AMI. 1. Build the AMI with Packer in the chosen source region. 2. Let Packer privately copy it to the regions listed in `ami/nat-zero-private-all-regions.pkrvars.hcl`. -3. Run two us-east-1 integration gates: - - direct test of the new AMI - - upgrade-path test that starts from the shared private test NAT AMI in `NAT_ZERO_TEST_AMI_ID`, reapplies the module with the new AMI, and verifies the old NAT is replaced +3. Run one us-east-1 integration gate on a single stack: + - deploy from the shared private test NAT AMI in `NAT_ZERO_TEST_AMI_ID` + - exercise the normal NAT lifecycle + - reapply the module with the new AMI + - verify the old NAT is replaced and the new NAT works 4. After the integration gates pass, run a small publish script that opens launch permissions for the copied AMIs. 5. Open a PR that updates the Terraform defaults (`ami_owner_account`, `ami_name_pattern`) so merge + release-please can publish the new module version. diff --git a/lambda.tf b/lambda.tf index 2d692c4..29f5733 100644 --- a/lambda.tf +++ b/lambda.tf @@ -59,7 +59,7 @@ resource "aws_lambda_function" "nat_zero" { runtime = "provided.al2023" source_code_hash = fileexists("${path.module}/.build/lambda.zip") ? 
filebase64sha256("${path.module}/.build/lambda.zip") : null architectures = ["arm64"] - timeout = 180 + timeout = 90 reserved_concurrent_executions = 1 memory_size = var.lambda_memory_size tags = local.common_tags diff --git a/tests/integration/fixture/main.tf b/tests/integration/fixture/main.tf index 9e182b2..932d90e 100644 --- a/tests/integration/fixture/main.tf +++ b/tests/integration/fixture/main.tf @@ -76,10 +76,15 @@ variable "nat_ami_id" { default = null } +variable "name" { + type = string + default = "nat-test" +} + module "nat_zero" { source = "../../../" - name = "nat-test" + name = var.name vpc_id = data.aws_vpc.default.id availability_zones = [data.aws_subnet.public.availability_zone] public_subnets = [data.aws_subnet.public.id] diff --git a/tests/integration/nat_zero_test.go b/tests/integration/nat_zero_test.go index df4cd33..c7f8a23 100644 --- a/tests/integration/nat_zero_test.go +++ b/tests/integration/nat_zero_test.go @@ -61,6 +61,7 @@ type phase struct { // connectivity, scale-down, restart, cleanup action, and terraform destroy. 
func TestNatZero(t *testing.T) { runID := fmt.Sprintf("tt-%d", time.Now().Unix()) + moduleName := fmt.Sprintf("nat-test-%s", runID) sess := session.Must(session.NewSession(&aws.Config{Region: aws.String(awsRegion)})) ec2Client := ec2.New(sess) iamClient := iam.New(sess) @@ -116,7 +117,10 @@ func TestNatZero(t *testing.T) { initialNatAMI := strings.TrimSpace(os.Getenv("NAT_ZERO_TEST_NAT_AMI_ID")) updatedNatAMI := strings.TrimSpace(os.Getenv("NAT_ZERO_TEST_UPDATED_NAT_AMI_ID")) - tfVars := map[string]interface{}{} + tfVars := map[string]interface{}{ + "name": moduleName, + } + t.Logf("Integration module name: %s", moduleName) if initialNatAMI != "" { tfVars["nat_ami_id"] = initialNatAMI t.Logf("Initial NAT AMI override: %s", initialNatAMI) @@ -951,13 +955,25 @@ func TestNoOrphanedResources(t *testing.T) { return found }}, {"Lambda", func() []string { - _, err := lambdaClient.GetFunction(&lambda.GetFunctionInput{ - FunctionName: aws.String(testPrefix + "-nat-zero"), - }) - if err == nil { - return []string{"Lambda nat-test-nat-zero"} + var found []string + var marker *string + for { + out, err := lambdaClient.ListFunctions(&lambda.ListFunctionsInput{Marker: marker}) + if err != nil { + return nil + } + for _, fn := range out.Functions { + name := aws.StringValue(fn.FunctionName) + if strings.HasPrefix(name, testPrefix) { + found = append(found, fmt.Sprintf("Lambda %s", name)) + } + } + if out.NextMarker == nil || aws.StringValue(out.NextMarker) == "" { + break + } + marker = out.NextMarker } - return nil + return found }}, {"LogGroups", func() []string { out, err := cwClient.DescribeLogGroups(&cloudwatchlogs.DescribeLogGroupsInput{ From 04a439cdc65581b69c9d5f6a6ffcd0812cd2b8b3 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Sun, 8 Mar 2026 19:04:29 +1000 Subject: [PATCH 12/27] fix: run reusable integration gates on PR labels --- .github/workflows/integration-tests.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 0e12191..fa34f79 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -36,9 +36,10 @@ env: jobs: integration-test: if: >- - github.event_name == 'workflow_call' || - github.event_name == 'workflow_dispatch' || - github.event.label.name == 'integration-test' + github.event_name != 'pull_request' || + github.event.label.name == 'integration-test' || + inputs.nat_ami_id != '' || + inputs.updated_nat_ami_id != '' runs-on: ubuntu-latest timeout-minutes: 15 environment: integration From 5fb536b5a9a99adbb8f3140e8cbad73898d79930 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 06:48:40 +1000 Subject: [PATCH 13/27] chore: tighten workflow security permissions --- .github/workflows/nat-images.yml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml index 0bfe43c..90ae6d4 100644 --- a/.github/workflows/nat-images.yml +++ b/.github/workflows/nat-images.yml @@ -26,9 +26,7 @@ concurrency: cancel-in-progress: false permissions: - id-token: write - contents: write - pull-requests: write + contents: read env: PACKER_REGIONS_FILE: ami/nat-zero-private-all-regions.pkrvars.hcl @@ -36,6 +34,8 @@ env: jobs: resolve-inputs: runs-on: ubuntu-latest + permissions: + contents: read outputs: build_subnet_id: ${{ steps.resolve.outputs.build_subnet_id }} source_region: ${{ steps.resolve.outputs.source_region }} @@ -90,6 +90,9 @@ jobs: if: ${{ needs.resolve-inputs.outputs.should_run == 'true' }} runs-on: ubuntu-latest environment: ami-build + permissions: + id-token: write + contents: read outputs: ami_name: ${{ steps.metadata.outputs.ami_name }} owner_account_id: ${{ steps.metadata.outputs.owner_account_id }} @@ -205,6 +208,9 @@ jobs: ) runs-on: ubuntu-latest environment: ami-build + permissions: + id-token: write + contents: read steps: - 
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 @@ -231,6 +237,9 @@ jobs: - publish-public if: ${{ needs.resolve-inputs.outputs.should_publish == 'true' && needs.publish-public.result == 'success' }} runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: From fb021b794fc294899e7100d33ac496311d979d7e Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 08:32:29 +1000 Subject: [PATCH 14/27] fix: stabilize lambda artifact planning --- .github/workflows/release-please.yml | 8 ++-- README.md | 13 ++++-- docs/examples.md | 18 +++++++- docs/reference.md | 13 ++++-- docs/workflows.md | 23 ++++++----- lambda.tf | 61 ++++++++++++++++++++++++---- tests/integration/fixture/main.tf | 1 + variables.tf | 22 +++++++++- versions.tf | 6 ++- 9 files changed, 129 insertions(+), 36 deletions(-) diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml index 9c35b37..42ab713 100644 --- a/.github/workflows/release-please.yml +++ b/.github/workflows/release-please.yml @@ -40,12 +40,14 @@ jobs: run: GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -tags lambda.norpc -ldflags='-s -w' -o bootstrap - name: Package - run: zip lambda.zip bootstrap + run: | + zip lambda.zip bootstrap + openssl dgst -sha256 -binary lambda.zip | openssl base64 -A > lambda.zip.base64sha256 - name: Upload to versioned release env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: gh release upload "${{ needs.release-please.outputs.tag_name }}" lambda.zip --clobber + run: gh release upload "${{ needs.release-please.outputs.tag_name }}" lambda.zip lambda.zip.base64sha256 --clobber - name: Update rolling latest release env: @@ -55,4 +57,4 @@ jobs: --title "nat-zero Lambda (latest)" \ --notes "Auto-built Go Lambda binary from ${{ needs.release-please.outputs.tag_name }}" \ --latest=false 2>/dev/null || true - gh release upload nat-zero-lambda-latest 
lambda.zip --clobber + gh release upload nat-zero-lambda-latest lambda.zip lambda.zip.base64sha256 --clobber diff --git a/README.md b/README.md index 2a9bd5f..fd5c206 100644 --- a/README.md +++ b/README.md @@ -110,8 +110,9 @@ See [Performance](docs/performance.md) for detailed timings and cost breakdowns. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.3 | +| [terraform](#requirement\_terraform) | >= 1.4 | | [aws](#requirement\_aws) | >= 5.0 | +| [http](#requirement\_http) | >= 3.0 | | [null](#requirement\_null) | >= 3.0 | | [time](#requirement\_time) | >= 0.9 | @@ -141,10 +142,11 @@ No modules. | [aws_route.nat_route](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/route) | resource | | [aws_security_group.nat_security_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | | [null_resource.build_lambda](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [null_resource.download_lambda](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [terraform_data.download_lambda](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | | [time_sleep.eventbridge_propagation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [time_sleep.lambda_ready](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [aws_ami.nat](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | +| [http_http.lambda_binary_hash](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | ## Inputs @@ -155,13 +157,16 @@ No modules. 
| [ami\_owner\_account](#input\_ami\_owner\_account) | Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI. | `string` | `"590144423513"` | no | | [availability\_zones](#input\_availability\_zones) | List of availability zones to deploy NAT instances in | `list(string)` | n/a | yes | | [block\_device\_size](#input\_block\_device\_size) | Size in GB of the root EBS volume | `number` | `10` | no | -| [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source instead of downloading a pre-compiled release. Requires Go and zip installed locally. | `bool` | `false` | no | +| [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source during apply instead of downloading a pre-compiled release. This is primarily for local development and may require a second apply after code changes. | `bool` | `false` | no | | [enable\_logging](#input\_enable\_logging) | Create a CloudWatch log group for the Lambda function | `bool` | `true` | no | | [encrypt\_root\_volume](#input\_encrypt\_root\_volume) | Encrypt the root EBS volume. | `bool` | `true` | no | | [ignore\_tag\_key](#input\_ignore\_tag\_key) | Tag key used to mark instances the Lambda should ignore | `string` | `"nat-zero:ignore"` | no | | [ignore\_tag\_value](#input\_ignore\_tag\_value) | Tag value used to mark instances the Lambda should ignore | `string` | `"true"` | no | | [instance\_type](#input\_instance\_type) | Instance type for the NAT instance | `string` | `"t4g.nano"` | no | -| [lambda\_binary\_url](#input\_lambda\_binary\_url) | URL to the pre-compiled Go Lambda zip. Updated automatically by CI. | `string` | `"https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip"` | no | +| [lambda\_binary\_base64sha256](#input\_lambda\_binary\_base64sha256) | Optional base64-encoded SHA256 of the Lambda zip. 
Override this for custom artifacts or when you want to avoid fetching the published checksum URL. | `string` | `null` | no | +| [lambda\_binary\_base64sha256\_url](#input\_lambda\_binary\_base64sha256\_url) | Optional URL returning the base64-encoded SHA256 for lambda\_binary\_url. Defaults to appending .base64sha256 to lambda\_binary\_url. | `string` | `null` | no | +| [lambda\_binary\_path](#input\_lambda\_binary\_path) | Optional path to a pre-built Lambda zip on disk. Use this to build the artifact outside Terraform and avoid apply-time compilation. | `string` | `null` | no | +| [lambda\_binary\_url](#input\_lambda\_binary\_url) | URL to the pre-compiled Go Lambda zip. Used when lambda\_binary\_path is null and build\_lambda\_locally is false. | `string` | `"https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip"` | no | | [lambda\_memory\_size](#input\_lambda\_memory\_size) | Memory allocated to the Lambda function in MB (also scales CPU proportionally) | `number` | `128` | no | | [log\_retention\_days](#input\_log\_retention\_days) | CloudWatch log retention in days (only used when enable\_logging is true) | `number` | `14` | no | | [market\_type](#input\_market\_type) | Whether to use spot or on-demand instances | `string` | `"on-demand"` | no | diff --git a/docs/examples.md b/docs/examples.md index ab5ae72..7eab177 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -135,7 +135,7 @@ module "nat_zero" { ## Building Lambda Locally -For development or if you want to build from source: +For development or if you want to build from source during `terraform apply`: ```hcl module "nat_zero" { @@ -147,4 +147,18 @@ module "nat_zero" { } ``` -Requires Go and `zip` installed locally. +Requires Go and `zip` installed locally. This is a non-standard path and may require a second apply after code changes. 
+ +## Using a Pre-built Local Lambda Zip + +For CI or local testing, it is cleaner to build the zip outside Terraform and pass it in directly: + +```hcl +module "nat_zero" { + source = "github.com/MachineDotDev/nat-zero" + + # ... required variables ... + + lambda_binary_path = "${path.module}/.build/lambda.zip" +} +``` diff --git a/docs/reference.md b/docs/reference.md index 6979b9d..455e247 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -2,8 +2,9 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.3 | +| [terraform](#requirement\_terraform) | >= 1.4 | | [aws](#requirement\_aws) | >= 5.0 | +| [http](#requirement\_http) | >= 3.0 | | [null](#requirement\_null) | >= 3.0 | | [time](#requirement\_time) | >= 0.9 | @@ -33,10 +34,11 @@ No modules. | [aws_route.nat_route](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/route) | resource | | [aws_security_group.nat_security_group](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | | [null_resource.build_lambda](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [null_resource.download_lambda](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [terraform_data.download_lambda](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | | [time_sleep.eventbridge_propagation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [time_sleep.lambda_ready](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [aws_ami.nat](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | +| [http_http.lambda_binary_hash](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | ## Inputs @@ 
-47,13 +49,16 @@ No modules. | [ami\_owner\_account](#input\_ami\_owner\_account) | Owner account ID used when resolving the default nat-zero AMI by name pattern. Override this to use your own shared AMI. | `string` | `"590144423513"` | no | | [availability\_zones](#input\_availability\_zones) | List of availability zones to deploy NAT instances in | `list(string)` | n/a | yes | | [block\_device\_size](#input\_block\_device\_size) | Size in GB of the root EBS volume | `number` | `10` | no | -| [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source instead of downloading a pre-compiled release. Requires Go and zip installed locally. | `bool` | `false` | no | +| [build\_lambda\_locally](#input\_build\_lambda\_locally) | Build the Lambda binary from Go source during apply instead of downloading a pre-compiled release. This is primarily for local development and may require a second apply after code changes. | `bool` | `false` | no | | [enable\_logging](#input\_enable\_logging) | Create a CloudWatch log group for the Lambda function | `bool` | `true` | no | | [encrypt\_root\_volume](#input\_encrypt\_root\_volume) | Encrypt the root EBS volume. | `bool` | `true` | no | | [ignore\_tag\_key](#input\_ignore\_tag\_key) | Tag key used to mark instances the Lambda should ignore | `string` | `"nat-zero:ignore"` | no | | [ignore\_tag\_value](#input\_ignore\_tag\_value) | Tag value used to mark instances the Lambda should ignore | `string` | `"true"` | no | | [instance\_type](#input\_instance\_type) | Instance type for the NAT instance | `string` | `"t4g.nano"` | no | -| [lambda\_binary\_url](#input\_lambda\_binary\_url) | URL to the pre-compiled Go Lambda zip. Updated automatically by CI. | `string` | `"https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip"` | no | +| [lambda\_binary\_base64sha256](#input\_lambda\_binary\_base64sha256) | Optional base64-encoded SHA256 of the Lambda zip. 
Override this for custom artifacts or when you want to avoid fetching the published checksum URL. | `string` | `null` | no | +| [lambda\_binary\_base64sha256\_url](#input\_lambda\_binary\_base64sha256\_url) | Optional URL returning the base64-encoded SHA256 for lambda\_binary\_url. Defaults to appending .base64sha256 to lambda\_binary\_url. | `string` | `null` | no | +| [lambda\_binary\_path](#input\_lambda\_binary\_path) | Optional path to a pre-built Lambda zip on disk. Use this to build the artifact outside Terraform and avoid apply-time compilation. | `string` | `null` | no | +| [lambda\_binary\_url](#input\_lambda\_binary\_url) | URL to the pre-compiled Go Lambda zip. Used when lambda\_binary\_path is null and build\_lambda\_locally is false. | `string` | `"https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip"` | no | | [lambda\_memory\_size](#input\_lambda\_memory\_size) | Memory allocated to the Lambda function in MB (also scales CPU proportionally) | `number` | `128` | no | | [log\_retention\_days](#input\_log\_retention\_days) | CloudWatch log retention in days (only used when enable\_logging is true) | `number` | `14` | no | | [market\_type](#input\_market\_type) | Whether to use spot or on-demand instances | `string` | `"on-demand"` | no | diff --git a/docs/workflows.md b/docs/workflows.md index 7509c1d..19c408e 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -106,8 +106,9 @@ Only runs when `release_created == 'true'` (i.e., the push that merges a release 1. Cross-compiles the Go Lambda for `linux/arm64`. 2. Zips as `lambda.zip`. -3. **Uploads to the versioned release** (e.g., `v0.1.0`). -4. **Creates/updates a rolling `nat-zero-lambda-latest` release** with the same zip. This provides a stable URL for the module's default `lambda_binary_url`. +3. Writes `lambda.zip.base64sha256`, containing the base64-encoded SHA256 for the zip. +4. 
**Uploads the zip and checksum to the versioned release** (e.g., `v0.1.0`). +5. **Creates/updates a rolling `nat-zero-lambda-latest` release** with the same zip and checksum. This provides a stable URL for the module's default `lambda_binary_url` and the matching default checksum URL. ### Changelog sections @@ -124,16 +125,16 @@ Only runs when `release_created == 'true'` (i.e., the push that merges a release ### `main` branch ruleset -- **No direct push**: creation, update, deletion, and non-fast-forward all blocked. - **PRs required** with: - - 1 approving review + - 0 required approvals - Stale reviews dismissed on push - - Last push approval required (reviewer cannot be the person who pushed the last commit) - All review threads must be resolved - - **Squash merge only** -- **Required status checks**: `precommit`, `go-test`, `integration-test` - - `strict_required_status_checks_policy: false` -- checks that don't run (path filtering / label gating) won't block merge. -- **Bypass**: Admin role can bypass always. +- **Required status checks**: `precommit`, `go-test` + - strict mode enabled, so required checks must be up to date with `main` +- **Linear history required** +- **No force push** +- **No branch deletion** +- **Bypass**: Admin role can bypass because `enforce_admins` is disabled. 
### `tags` ruleset @@ -148,7 +149,7 @@ Open PR -> precommit runs (always) -> go-test runs (if cmd/lambda/** changed) -> Add "integration-test" label -> integration tests run against real AWS - -> 1 approval + threads resolved + -> threads resolved -> Squash merge to main Post-merge to main: @@ -157,5 +158,5 @@ Post-merge to main: Merge release PR: -> release-please creates GitHub Release + tag - -> build-lambda uploads lambda.zip to release + rolling latest + -> build-lambda uploads lambda.zip + lambda.zip.base64sha256 to release + rolling latest ``` diff --git a/lambda.tf b/lambda.tf index 29f5733..1204653 100644 --- a/lambda.tf +++ b/lambda.tf @@ -17,15 +17,47 @@ resource "time_sleep" "lambda_ready" { destroy_duration = var.enable_logging ? "10s" : "0s" } -resource "null_resource" "download_lambda" { - count = var.build_lambda_locally ? 0 : 1 +locals { + downloaded_lambda_zip_path = "${path.module}/.build/lambda.zip" + lambda_binary_hash_url = coalesce(var.lambda_binary_base64sha256_url, "${var.lambda_binary_url}.base64sha256") + local_lambda_zip_path = coalesce(var.lambda_binary_path, local.downloaded_lambda_zip_path) + local_lambda_source_hash = var.lambda_binary_path != null ? ( + coalesce(var.lambda_binary_base64sha256, filebase64sha256(var.lambda_binary_path)) + ) : ( + fileexists(local.downloaded_lambda_zip_path) ? filebase64sha256(local.downloaded_lambda_zip_path) : null + ) + downloaded_lambda_source_hash = coalesce( + var.lambda_binary_base64sha256, + one(data.http.lambda_binary_hash[*].response_body), + null, + ) + lambda_source_hash = var.build_lambda_locally ? local.local_lambda_source_hash : trimspace(coalesce(local.downloaded_lambda_source_hash, "")) +} - triggers = { - url = var.lambda_binary_url +data "http" "lambda_binary_hash" { + count = var.build_lambda_locally || var.lambda_binary_path != null || var.lambda_binary_base64sha256 != null ? 
0 : 1 + url = local.lambda_binary_hash_url + + request_headers = { + Accept = "text/plain" } +} + +resource "terraform_data" "download_lambda" { + count = var.build_lambda_locally || var.lambda_binary_path != null ? 0 : 1 + + triggers_replace = [ + path.module, + var.lambda_binary_url, + local.lambda_binary_hash_url, + trimspace(coalesce(local.downloaded_lambda_source_hash, "")), + ] provisioner "local-exec" { - command = "test -f ${path.module}/.build/lambda.zip || (mkdir -p ${path.module}/.build && curl -sfL -o ${path.module}/.build/lambda.zip ${var.lambda_binary_url})" + command = <<-EOT + mkdir -p "${path.module}/.build" && \ + curl -sfL -o "${local.downloaded_lambda_zip_path}" "${var.lambda_binary_url}" + EOT } } @@ -33,8 +65,12 @@ resource "null_resource" "build_lambda" { count = var.build_lambda_locally ? 1 : 0 triggers = { + module_path = path.module source_hash = sha256(join("", [ - for f in sort(fileset("${path.module}/cmd/lambda", "*.go")) : + for f in sort(concat( + tolist(fileset("${path.module}/cmd/lambda", "*.go")), + ["go.mod", "go.sum"], + )) : filesha256("${path.module}/cmd/lambda/${f}") ])) } @@ -52,12 +88,12 @@ resource "null_resource" "build_lambda" { } resource "aws_lambda_function" "nat_zero" { - filename = "${path.module}/.build/lambda.zip" + filename = local.local_lambda_zip_path function_name = "${var.name}-nat-zero" handler = "bootstrap" role = aws_iam_role.lambda_iam_role.arn runtime = "provided.al2023" - source_code_hash = fileexists("${path.module}/.build/lambda.zip") ? 
filebase64sha256("${path.module}/.build/lambda.zip") : null + source_code_hash = local.lambda_source_hash architectures = ["arm64"] timeout = 90 reserved_concurrent_executions = 1 @@ -81,7 +117,14 @@ resource "aws_lambda_function" "nat_zero" { } } - depends_on = [time_sleep.lambda_ready, null_resource.download_lambda, null_resource.build_lambda] + lifecycle { + precondition { + condition = !(var.build_lambda_locally && var.lambda_binary_path != null) + error_message = "build_lambda_locally and lambda_binary_path cannot be used together." + } + } + + depends_on = [time_sleep.lambda_ready, terraform_data.download_lambda, null_resource.build_lambda] } resource "aws_lambda_function_event_invoke_config" "nat_zero_invoke_config" { diff --git a/tests/integration/fixture/main.tf b/tests/integration/fixture/main.tf index 932d90e..820ebf9 100644 --- a/tests/integration/fixture/main.tf +++ b/tests/integration/fixture/main.tf @@ -97,6 +97,7 @@ module "nat_zero" { market_type = "on-demand" encrypt_root_volume = var.encrypt_root_volume ami_id = var.nat_ami_id + lambda_binary_path = fileexists("${path.module}/../../.build/lambda.zip") ? abspath("${path.module}/../../.build/lambda.zip") : null } output "vpc_id" { diff --git a/variables.tf b/variables.tf index 9513cee..b54c9f7 100644 --- a/variables.tf +++ b/variables.tf @@ -136,11 +136,29 @@ variable "log_retention_days" { variable "build_lambda_locally" { type = bool default = false - description = "Build the Lambda binary from Go source instead of downloading a pre-compiled release. Requires Go and zip installed locally." + description = "Build the Lambda binary from Go source during apply instead of downloading a pre-compiled release. This is primarily for local development and may require a second apply after code changes." +} + +variable "lambda_binary_path" { + type = string + default = null + description = "Optional path to a pre-built Lambda zip on disk. 
Use this to build the artifact outside Terraform and avoid apply-time compilation." } variable "lambda_binary_url" { type = string default = "https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip" - description = "URL to the pre-compiled Go Lambda zip. Updated automatically by CI." + description = "URL to the pre-compiled Go Lambda zip. Used when lambda_binary_path is null and build_lambda_locally is false." +} + +variable "lambda_binary_base64sha256" { + type = string + default = null + description = "Optional base64-encoded SHA256 of the Lambda zip. Override this for custom artifacts or when you want to avoid fetching the published checksum URL." +} + +variable "lambda_binary_base64sha256_url" { + type = string + default = null + description = "Optional URL returning the base64-encoded SHA256 for lambda_binary_url. Defaults to appending .base64sha256 to lambda_binary_url." } diff --git a/versions.tf b/versions.tf index dd2367d..ff35e50 100644 --- a/versions.tf +++ b/versions.tf @@ -1,11 +1,15 @@ terraform { - required_version = ">= 1.3" + required_version = ">= 1.4" required_providers { aws = { source = "hashicorp/aws" version = ">= 5.0" } + http = { + source = "hashicorp/http" + version = ">= 3.0" + } null = { source = "hashicorp/null" version = ">= 3.0" From 405805d9d90718c081f3d7b8af8b699b9fb3f6ff Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 08:46:54 +1000 Subject: [PATCH 15/27] refactor: pin lambda assets to module releases --- README.md | 3 +-- docs/reference.md | 3 +-- docs/workflows.md | 2 +- lambda.tf | 13 ++++++++----- variables.tf | 10 ++-------- 5 files changed, 13 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index fd5c206..80bb56e 100644 --- a/README.md +++ b/README.md @@ -164,9 +164,8 @@ No modules. 
| [ignore\_tag\_value](#input\_ignore\_tag\_value) | Tag value used to mark instances the Lambda should ignore | `string` | `"true"` | no | | [instance\_type](#input\_instance\_type) | Instance type for the NAT instance | `string` | `"t4g.nano"` | no | | [lambda\_binary\_base64sha256](#input\_lambda\_binary\_base64sha256) | Optional base64-encoded SHA256 of the Lambda zip. Override this for custom artifacts or when you want to avoid fetching the published checksum URL. | `string` | `null` | no | -| [lambda\_binary\_base64sha256\_url](#input\_lambda\_binary\_base64sha256\_url) | Optional URL returning the base64-encoded SHA256 for lambda\_binary\_url. Defaults to appending .base64sha256 to lambda\_binary\_url. | `string` | `null` | no | | [lambda\_binary\_path](#input\_lambda\_binary\_path) | Optional path to a pre-built Lambda zip on disk. Use this to build the artifact outside Terraform and avoid apply-time compilation. | `string` | `null` | no | -| [lambda\_binary\_url](#input\_lambda\_binary\_url) | URL to the pre-compiled Go Lambda zip. Used when lambda\_binary\_path is null and build\_lambda\_locally is false. | `string` | `"https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip"` | no | +| [lambda\_binary\_url](#input\_lambda\_binary\_url) | Optional URL to a pre-compiled Go Lambda zip. Defaults to the versioned release asset matching this module release tag. 
| `string` | `null` | no | | [lambda\_memory\_size](#input\_lambda\_memory\_size) | Memory allocated to the Lambda function in MB (also scales CPU proportionally) | `number` | `128` | no | | [log\_retention\_days](#input\_log\_retention\_days) | CloudWatch log retention in days (only used when enable\_logging is true) | `number` | `14` | no | | [market\_type](#input\_market\_type) | Whether to use spot or on-demand instances | `string` | `"on-demand"` | no | diff --git a/docs/reference.md b/docs/reference.md index 455e247..3f44355 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -56,9 +56,8 @@ No modules. | [ignore\_tag\_value](#input\_ignore\_tag\_value) | Tag value used to mark instances the Lambda should ignore | `string` | `"true"` | no | | [instance\_type](#input\_instance\_type) | Instance type for the NAT instance | `string` | `"t4g.nano"` | no | | [lambda\_binary\_base64sha256](#input\_lambda\_binary\_base64sha256) | Optional base64-encoded SHA256 of the Lambda zip. Override this for custom artifacts or when you want to avoid fetching the published checksum URL. | `string` | `null` | no | -| [lambda\_binary\_base64sha256\_url](#input\_lambda\_binary\_base64sha256\_url) | Optional URL returning the base64-encoded SHA256 for lambda\_binary\_url. Defaults to appending .base64sha256 to lambda\_binary\_url. | `string` | `null` | no | | [lambda\_binary\_path](#input\_lambda\_binary\_path) | Optional path to a pre-built Lambda zip on disk. Use this to build the artifact outside Terraform and avoid apply-time compilation. | `string` | `null` | no | -| [lambda\_binary\_url](#input\_lambda\_binary\_url) | URL to the pre-compiled Go Lambda zip. Used when lambda\_binary\_path is null and build\_lambda\_locally is false. | `string` | `"https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip"` | no | +| [lambda\_binary\_url](#input\_lambda\_binary\_url) | Optional URL to a pre-compiled Go Lambda zip. 
Defaults to the versioned release asset matching this module release tag. | `string` | `null` | no | | [lambda\_memory\_size](#input\_lambda\_memory\_size) | Memory allocated to the Lambda function in MB (also scales CPU proportionally) | `number` | `128` | no | | [log\_retention\_days](#input\_log\_retention\_days) | CloudWatch log retention in days (only used when enable\_logging is true) | `number` | `14` | no | | [market\_type](#input\_market\_type) | Whether to use spot or on-demand instances | `string` | `"on-demand"` | no | diff --git a/docs/workflows.md b/docs/workflows.md index 19c408e..c2e0375 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -108,7 +108,7 @@ Only runs when `release_created == 'true'` (i.e., the push that merges a release 2. Zips as `lambda.zip`. 3. Writes `lambda.zip.base64sha256`, containing the base64-encoded SHA256 for the zip. 4. **Uploads the zip and checksum to the versioned release** (e.g., `v0.1.0`). -5. **Creates/updates a rolling `nat-zero-lambda-latest` release** with the same zip and checksum. This provides a stable URL for the module's default `lambda_binary_url` and the matching default checksum URL. +5. **Creates/updates a rolling `nat-zero-lambda-latest` release** with the same zip and checksum for convenience, but the module default now pins to the versioned release asset that matches the tagged module version. 
### Changelog sections diff --git a/lambda.tf b/lambda.tf index 1204653..442d5bc 100644 --- a/lambda.tf +++ b/lambda.tf @@ -18,9 +18,12 @@ resource "time_sleep" "lambda_ready" { } locals { - downloaded_lambda_zip_path = "${path.module}/.build/lambda.zip" - lambda_binary_hash_url = coalesce(var.lambda_binary_base64sha256_url, "${var.lambda_binary_url}.base64sha256") - local_lambda_zip_path = coalesce(var.lambda_binary_path, local.downloaded_lambda_zip_path) + module_release_version = jsondecode(file("${path.module}/.release-please-manifest.json"))["."] + default_lambda_binary_url = "https://github.com/MachineDotDev/nat-zero/releases/download/v${local.module_release_version}/lambda.zip" + downloaded_lambda_zip_path = "${path.module}/.build/lambda.zip" + effective_lambda_binary_url = coalesce(var.lambda_binary_url, local.default_lambda_binary_url) + lambda_binary_hash_url = "${local.effective_lambda_binary_url}.base64sha256" + local_lambda_zip_path = coalesce(var.lambda_binary_path, local.downloaded_lambda_zip_path) local_lambda_source_hash = var.lambda_binary_path != null ? 
( coalesce(var.lambda_binary_base64sha256, filebase64sha256(var.lambda_binary_path)) ) : ( @@ -48,7 +51,7 @@ resource "terraform_data" "download_lambda" { triggers_replace = [ path.module, - var.lambda_binary_url, + local.effective_lambda_binary_url, local.lambda_binary_hash_url, trimspace(coalesce(local.downloaded_lambda_source_hash, "")), ] @@ -56,7 +59,7 @@ resource "terraform_data" "download_lambda" { provisioner "local-exec" { command = <<-EOT mkdir -p "${path.module}/.build" && \ - curl -sfL -o "${local.downloaded_lambda_zip_path}" "${var.lambda_binary_url}" + curl -sfL -o "${local.downloaded_lambda_zip_path}" "${local.effective_lambda_binary_url}" EOT } } diff --git a/variables.tf b/variables.tf index b54c9f7..d2dbca2 100644 --- a/variables.tf +++ b/variables.tf @@ -147,8 +147,8 @@ variable "lambda_binary_path" { variable "lambda_binary_url" { type = string - default = "https://github.com/MachineDotDev/nat-zero/releases/download/nat-zero-lambda-latest/lambda.zip" - description = "URL to the pre-compiled Go Lambda zip. Used when lambda_binary_path is null and build_lambda_locally is false." + default = null + description = "Optional URL to a pre-compiled Go Lambda zip. Defaults to the versioned release asset matching this module release tag." } variable "lambda_binary_base64sha256" { @@ -156,9 +156,3 @@ variable "lambda_binary_base64sha256" { default = null description = "Optional base64-encoded SHA256 of the Lambda zip. Override this for custom artifacts or when you want to avoid fetching the published checksum URL." } - -variable "lambda_binary_base64sha256_url" { - type = string - default = null - description = "Optional URL returning the base64-encoded SHA256 for lambda_binary_url. Defaults to appending .base64sha256 to lambda_binary_url." 
-} From 28dd6dc837d7f753a44e5eb2097be3a60e99a745 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 09:19:53 +1000 Subject: [PATCH 16/27] fix: simplify lambda artifact release flow --- .github/workflows/release-please-lambda.yml | 74 +++++++++++++++++++++ .github/workflows/release-please.yml | 20 ++++-- .lambda-release.json | 4 ++ README.md | 4 -- docs/examples.md | 2 + docs/reference.md | 4 -- docs/workflows.md | 26 ++++++-- lambda.tf | 47 ++++++------- variables.tf | 12 ---- versions.tf | 4 -- 10 files changed, 135 insertions(+), 62 deletions(-) create mode 100644 .github/workflows/release-please-lambda.yml create mode 100644 .lambda-release.json diff --git a/.github/workflows/release-please-lambda.yml b/.github/workflows/release-please-lambda.yml new file mode 100644 index 0000000..b7cd701 --- /dev/null +++ b/.github/workflows/release-please-lambda.yml @@ -0,0 +1,74 @@ +name: Prepare Release Lambda + +on: + pull_request_target: + types: [opened, synchronize, reopened] + +permissions: + contents: write + +jobs: + update-lambda-hash: + if: github.event.pull_request.head.repo.full_name == github.repository && startsWith(github.event.pull_request.head.ref, 'release-please--') && github.event.pull_request.user.login == 'app/github-actions' + runs-on: ubuntu-latest + defaults: + run: + working-directory: cmd/lambda + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + ref: ${{ github.event.pull_request.head.ref }} + token: ${{ secrets.GITHUB_TOKEN }} + + - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 + with: + go-version-file: cmd/lambda/go.mod + + - name: Build deterministic Lambda zip + run: | + GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -trimpath -buildvcs=false -tags lambda.norpc -ldflags='-s -w -buildid=' -o bootstrap + TZ=UTC touch -t 198001010000 bootstrap + zip -q -X lambda.zip bootstrap + + - name: Read release version + id: release-version + run: echo "release_version=$(jq 
-r '."."' ../../.release-please-manifest.json)" >> "$GITHUB_OUTPUT" + + - name: Read Lambda hash + id: lambda-hash + run: echo "lambda_hash=$(openssl dgst -sha256 -binary lambda.zip | openssl base64 -A)" >> "$GITHUB_OUTPUT" + + - name: Update committed Lambda metadata + env: + RELEASE_VERSION: ${{ steps.release-version.outputs.release_version }} + LAMBDA_HASH: ${{ steps.lambda-hash.outputs.lambda_hash }} + run: | + python - <<'PY' + import json + import os + from pathlib import Path + + Path("../../.lambda-release.json").write_text( + json.dumps( + { + "version": os.environ["RELEASE_VERSION"], + "base64sha256": os.environ["LAMBDA_HASH"], + }, + indent=2, + ) + + "\n" + ) + PY + + - name: Commit metadata update + working-directory: . + run: | + if git diff --quiet -- .lambda-release.json; then + exit 0 + fi + + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git add .lambda-release.json + git commit -m "chore: update lambda release hash" + git push diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml index 42ab713..7352f33 100644 --- a/.github/workflows/release-please.yml +++ b/.github/workflows/release-please.yml @@ -37,17 +37,27 @@ jobs: go-version-file: cmd/lambda/go.mod - name: Build - run: GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -tags lambda.norpc -ldflags='-s -w' -o bootstrap + run: GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -trimpath -buildvcs=false -tags lambda.norpc -ldflags='-s -w -buildid=' -o bootstrap - name: Package run: | - zip lambda.zip bootstrap - openssl dgst -sha256 -binary lambda.zip | openssl base64 -A > lambda.zip.base64sha256 + TZ=UTC touch -t 198001010000 bootstrap + zip -q -X lambda.zip bootstrap + + - name: Verify committed Lambda hash + run: | + expected_version=$(jq -r '.version' ../../.lambda-release.json) + actual_version=$(jq -r '."."' ../../.release-please-manifest.json) + test "$expected_version" = 
"$actual_version" + + expected_hash=$(jq -r '.base64sha256' ../../.lambda-release.json) + actual_hash=$(openssl dgst -sha256 -binary lambda.zip | openssl base64 -A) + test "$expected_hash" = "$actual_hash" - name: Upload to versioned release env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: gh release upload "${{ needs.release-please.outputs.tag_name }}" lambda.zip lambda.zip.base64sha256 --clobber + run: gh release upload "${{ needs.release-please.outputs.tag_name }}" lambda.zip --clobber - name: Update rolling latest release env: @@ -57,4 +67,4 @@ jobs: --title "nat-zero Lambda (latest)" \ --notes "Auto-built Go Lambda binary from ${{ needs.release-please.outputs.tag_name }}" \ --latest=false 2>/dev/null || true - gh release upload nat-zero-lambda-latest lambda.zip lambda.zip.base64sha256 --clobber + gh release upload nat-zero-lambda-latest lambda.zip --clobber diff --git a/.lambda-release.json b/.lambda-release.json new file mode 100644 index 0000000..c9e9f52 --- /dev/null +++ b/.lambda-release.json @@ -0,0 +1,4 @@ +{ + "version": "0.3.0", + "base64sha256": "Fk59AnRhiNu5O35sYG3VMM+iGPTu1il5La5+O1a51uo=" +} diff --git a/README.md b/README.md index 80bb56e..86f51d6 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,6 @@ See [Performance](docs/performance.md) for detailed timings and cost breakdowns. |------|---------| | [terraform](#requirement\_terraform) | >= 1.4 | | [aws](#requirement\_aws) | >= 5.0 | -| [http](#requirement\_http) | >= 3.0 | | [null](#requirement\_null) | >= 3.0 | | [time](#requirement\_time) | >= 0.9 | @@ -146,7 +145,6 @@ No modules. 
| [time_sleep.eventbridge_propagation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [time_sleep.lambda_ready](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [aws_ami.nat](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | -| [http_http.lambda_binary_hash](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | ## Inputs @@ -163,9 +161,7 @@ No modules. | [ignore\_tag\_key](#input\_ignore\_tag\_key) | Tag key used to mark instances the Lambda should ignore | `string` | `"nat-zero:ignore"` | no | | [ignore\_tag\_value](#input\_ignore\_tag\_value) | Tag value used to mark instances the Lambda should ignore | `string` | `"true"` | no | | [instance\_type](#input\_instance\_type) | Instance type for the NAT instance | `string` | `"t4g.nano"` | no | -| [lambda\_binary\_base64sha256](#input\_lambda\_binary\_base64sha256) | Optional base64-encoded SHA256 of the Lambda zip. Override this for custom artifacts or when you want to avoid fetching the published checksum URL. | `string` | `null` | no | | [lambda\_binary\_path](#input\_lambda\_binary\_path) | Optional path to a pre-built Lambda zip on disk. Use this to build the artifact outside Terraform and avoid apply-time compilation. | `string` | `null` | no | -| [lambda\_binary\_url](#input\_lambda\_binary\_url) | Optional URL to a pre-compiled Go Lambda zip. Defaults to the versioned release asset matching this module release tag. 
| `string` | `null` | no | | [lambda\_memory\_size](#input\_lambda\_memory\_size) | Memory allocated to the Lambda function in MB (also scales CPU proportionally) | `number` | `128` | no | | [log\_retention\_days](#input\_log\_retention\_days) | CloudWatch log retention in days (only used when enable\_logging is true) | `number` | `14` | no | | [market\_type](#input\_market\_type) | Whether to use spot or on-demand instances | `string` | `"on-demand"` | no | diff --git a/docs/examples.md b/docs/examples.md index 7eab177..2c0c802 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -162,3 +162,5 @@ module "nat_zero" { lambda_binary_path = "${path.module}/.build/lambda.zip" } ``` + +This is also the right way to test an unreleased branch when the branch includes Lambda code changes. The default downloaded Lambda zip is pinned to the latest tagged module release. diff --git a/docs/reference.md b/docs/reference.md index 3f44355..2d74990 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -4,7 +4,6 @@ |------|---------| | [terraform](#requirement\_terraform) | >= 1.4 | | [aws](#requirement\_aws) | >= 5.0 | -| [http](#requirement\_http) | >= 3.0 | | [null](#requirement\_null) | >= 3.0 | | [time](#requirement\_time) | >= 0.9 | @@ -38,7 +37,6 @@ No modules. | [time_sleep.eventbridge_propagation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [time_sleep.lambda_ready](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [aws_ami.nat](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | -| [http_http.lambda_binary_hash](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | ## Inputs @@ -55,9 +53,7 @@ No modules. 
| [ignore\_tag\_key](#input\_ignore\_tag\_key) | Tag key used to mark instances the Lambda should ignore | `string` | `"nat-zero:ignore"` | no | | [ignore\_tag\_value](#input\_ignore\_tag\_value) | Tag value used to mark instances the Lambda should ignore | `string` | `"true"` | no | | [instance\_type](#input\_instance\_type) | Instance type for the NAT instance | `string` | `"t4g.nano"` | no | -| [lambda\_binary\_base64sha256](#input\_lambda\_binary\_base64sha256) | Optional base64-encoded SHA256 of the Lambda zip. Override this for custom artifacts or when you want to avoid fetching the published checksum URL. | `string` | `null` | no | | [lambda\_binary\_path](#input\_lambda\_binary\_path) | Optional path to a pre-built Lambda zip on disk. Use this to build the artifact outside Terraform and avoid apply-time compilation. | `string` | `null` | no | -| [lambda\_binary\_url](#input\_lambda\_binary\_url) | Optional URL to a pre-compiled Go Lambda zip. Defaults to the versioned release asset matching this module release tag. | `string` | `null` | no | | [lambda\_memory\_size](#input\_lambda\_memory\_size) | Memory allocated to the Lambda function in MB (also scales CPU proportionally) | `number` | `128` | no | | [log\_retention\_days](#input\_log\_retention\_days) | CloudWatch log retention in days (only used when enable\_logging is true) | `number` | `14` | no | | [market\_type](#input\_market\_type) | Whether to use spot or on-demand instances | `string` | `"on-demand"` | no | diff --git a/docs/workflows.md b/docs/workflows.md index c2e0375..bbaefcc 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -100,15 +100,31 @@ Runs `googleapis/release-please-action@v4` with: - Creates a **GitHub Release** with a version tag (e.g., `v0.1.0`). - Sets output `release_created=true` and `tag_name=v0.1.0`. +## Prepare Release Lambda (`release-please-lambda.yml`) + +Runs on release-please PRs before merge. + +1. Checks out the `release-please--...` branch. +2. 
Builds a deterministic `linux/arm64` Lambda zip from the PR contents. +3. Computes its base64 SHA256. +4. Updates `.lambda-release.json` with the release version from `.release-please-manifest.json` and the matching hash. +5. Commits that metadata back onto the release PR branch if it changed. + +This keeps the module default simple at runtime: + +- Terraform downloads a versioned `lambda.zip` release asset. +- Terraform reads the matching committed hash from `.lambda-release.json`. +- No checksum download is needed during `terraform plan`. + ### Job 2: `build-lambda` Only runs when `release_created == 'true'` (i.e., the push that merges a release PR). 1. Cross-compiles the Go Lambda for `linux/arm64`. -2. Zips as `lambda.zip`. -3. Writes `lambda.zip.base64sha256`, containing the base64-encoded SHA256 for the zip. -4. **Uploads the zip and checksum to the versioned release** (e.g., `v0.1.0`). -5. **Creates/updates a rolling `nat-zero-lambda-latest` release** with the same zip and checksum for convenience, but the module default now pins to the versioned release asset that matches the tagged module version. +2. Creates a deterministic `lambda.zip`. +3. Verifies the built zip matches the committed metadata in `.lambda-release.json`. +4. **Uploads the zip to the versioned release** (e.g., `v0.1.0`). +5. **Creates/updates a rolling `nat-zero-lambda-latest` release** with the same zip for convenience, but the module default pins to the versioned release asset that matches the tagged module version. 
### Changelog sections @@ -158,5 +174,5 @@ Post-merge to main: Merge release PR: -> release-please creates GitHub Release + tag - -> build-lambda uploads lambda.zip + lambda.zip.base64sha256 to release + rolling latest + -> build-lambda uploads lambda.zip to release + rolling latest ``` diff --git a/lambda.tf b/lambda.tf index 442d5bc..38bbea7 100644 --- a/lambda.tf +++ b/lambda.tf @@ -18,32 +18,18 @@ resource "time_sleep" "lambda_ready" { } locals { - module_release_version = jsondecode(file("${path.module}/.release-please-manifest.json"))["."] - default_lambda_binary_url = "https://github.com/MachineDotDev/nat-zero/releases/download/v${local.module_release_version}/lambda.zip" - downloaded_lambda_zip_path = "${path.module}/.build/lambda.zip" - effective_lambda_binary_url = coalesce(var.lambda_binary_url, local.default_lambda_binary_url) - lambda_binary_hash_url = "${local.effective_lambda_binary_url}.base64sha256" - local_lambda_zip_path = coalesce(var.lambda_binary_path, local.downloaded_lambda_zip_path) + lambda_release_metadata = jsondecode(file("${path.module}/.lambda-release.json")) + module_release_version = jsondecode(file("${path.module}/.release-please-manifest.json"))["."] + default_lambda_binary_url = "https://github.com/MachineDotDev/nat-zero/releases/download/v${local.module_release_version}/lambda.zip" + default_lambda_binary_base64sha256 = local.lambda_release_metadata.base64sha256 + downloaded_lambda_zip_path = "${path.module}/.build/lambda.zip" + local_lambda_zip_path = coalesce(var.lambda_binary_path, local.downloaded_lambda_zip_path) local_lambda_source_hash = var.lambda_binary_path != null ? ( - coalesce(var.lambda_binary_base64sha256, filebase64sha256(var.lambda_binary_path)) + filebase64sha256(var.lambda_binary_path) ) : ( fileexists(local.downloaded_lambda_zip_path) ? 
filebase64sha256(local.downloaded_lambda_zip_path) : null ) - downloaded_lambda_source_hash = coalesce( - var.lambda_binary_base64sha256, - one(data.http.lambda_binary_hash[*].response_body), - null, - ) - lambda_source_hash = var.build_lambda_locally ? local.local_lambda_source_hash : trimspace(coalesce(local.downloaded_lambda_source_hash, "")) -} - -data "http" "lambda_binary_hash" { - count = var.build_lambda_locally || var.lambda_binary_path != null || var.lambda_binary_base64sha256 != null ? 0 : 1 - url = local.lambda_binary_hash_url - - request_headers = { - Accept = "text/plain" - } + lambda_source_hash = var.build_lambda_locally || var.lambda_binary_path != null ? local.local_lambda_source_hash : local.default_lambda_binary_base64sha256 } resource "terraform_data" "download_lambda" { @@ -51,15 +37,14 @@ resource "terraform_data" "download_lambda" { triggers_replace = [ path.module, - local.effective_lambda_binary_url, - local.lambda_binary_hash_url, - trimspace(coalesce(local.downloaded_lambda_source_hash, "")), + local.default_lambda_binary_url, + local.default_lambda_binary_base64sha256, ] provisioner "local-exec" { command = <<-EOT mkdir -p "${path.module}/.build" && \ - curl -sfL -o "${local.downloaded_lambda_zip_path}" "${local.effective_lambda_binary_url}" + curl -sfL -o "${local.downloaded_lambda_zip_path}" "${local.default_lambda_binary_url}" EOT } } @@ -81,8 +66,9 @@ resource "null_resource" "build_lambda" { provisioner "local-exec" { command = <<-EOT cd ${path.module}/cmd/lambda && \ - GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -tags lambda.norpc -ldflags='-s -w' -o bootstrap && \ - zip lambda.zip bootstrap && \ + GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -trimpath -buildvcs=false -tags lambda.norpc -ldflags='-s -w -buildid=' -o bootstrap && \ + TZ=UTC touch -t 198001010000 bootstrap && \ + zip -q -X lambda.zip bootstrap && \ mkdir -p ../../.build && \ cp lambda.zip ../../.build/lambda.zip && \ rm bootstrap lambda.zip @@ -125,6 +111,11 @@ 
resource "aws_lambda_function" "nat_zero" { condition = !(var.build_lambda_locally && var.lambda_binary_path != null) error_message = "build_lambda_locally and lambda_binary_path cannot be used together." } + + precondition { + condition = var.build_lambda_locally || var.lambda_binary_path != null || local.lambda_release_metadata.version == local.module_release_version + error_message = ".lambda-release.json must match .release-please-manifest.json when using the bundled Lambda release artifact." + } } depends_on = [time_sleep.lambda_ready, terraform_data.download_lambda, null_resource.build_lambda] diff --git a/variables.tf b/variables.tf index d2dbca2..941c9de 100644 --- a/variables.tf +++ b/variables.tf @@ -144,15 +144,3 @@ variable "lambda_binary_path" { default = null description = "Optional path to a pre-built Lambda zip on disk. Use this to build the artifact outside Terraform and avoid apply-time compilation." } - -variable "lambda_binary_url" { - type = string - default = null - description = "Optional URL to a pre-compiled Go Lambda zip. Defaults to the versioned release asset matching this module release tag." -} - -variable "lambda_binary_base64sha256" { - type = string - default = null - description = "Optional base64-encoded SHA256 of the Lambda zip. Override this for custom artifacts or when you want to avoid fetching the published checksum URL." 
-} diff --git a/versions.tf b/versions.tf index ff35e50..165d398 100644 --- a/versions.tf +++ b/versions.tf @@ -6,10 +6,6 @@ terraform { source = "hashicorp/aws" version = ">= 5.0" } - http = { - source = "hashicorp/http" - version = ">= 3.0" - } null = { source = "hashicorp/null" version = ">= 3.0" From 9929617099d831de221cdfedab5dcb4f2da51a3c Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 09:39:27 +1000 Subject: [PATCH 17/27] fix: simplify lambda release flow --- .github/workflows/release-please-lambda.yml | 74 --------------------- .github/workflows/release-please.yml | 23 +------ .lambda-release.json | 4 -- README.md | 19 ++++++ docs/examples.md | 15 ++++- docs/reference.md | 2 + docs/workflows.md | 42 ++++++------ lambda.tf | 29 ++++---- versions.tf | 4 ++ 9 files changed, 75 insertions(+), 137 deletions(-) delete mode 100644 .github/workflows/release-please-lambda.yml delete mode 100644 .lambda-release.json diff --git a/.github/workflows/release-please-lambda.yml b/.github/workflows/release-please-lambda.yml deleted file mode 100644 index b7cd701..0000000 --- a/.github/workflows/release-please-lambda.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: Prepare Release Lambda - -on: - pull_request_target: - types: [opened, synchronize, reopened] - -permissions: - contents: write - -jobs: - update-lambda-hash: - if: github.event.pull_request.head.repo.full_name == github.repository && startsWith(github.event.pull_request.head.ref, 'release-please--') && github.event.pull_request.user.login == 'app/github-actions' - runs-on: ubuntu-latest - defaults: - run: - working-directory: cmd/lambda - steps: - - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - with: - ref: ${{ github.event.pull_request.head.ref }} - token: ${{ secrets.GITHUB_TOKEN }} - - - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 - with: - go-version-file: cmd/lambda/go.mod - - - name: Build deterministic Lambda zip - run: | - GOOS=linux 
GOARCH=arm64 CGO_ENABLED=0 go build -trimpath -buildvcs=false -tags lambda.norpc -ldflags='-s -w -buildid=' -o bootstrap - TZ=UTC touch -t 198001010000 bootstrap - zip -q -X lambda.zip bootstrap - - - name: Read release version - id: release-version - run: echo "release_version=$(jq -r '."."' ../../.release-please-manifest.json)" >> "$GITHUB_OUTPUT" - - - name: Read Lambda hash - id: lambda-hash - run: echo "lambda_hash=$(openssl dgst -sha256 -binary lambda.zip | openssl base64 -A)" >> "$GITHUB_OUTPUT" - - - name: Update committed Lambda metadata - env: - RELEASE_VERSION: ${{ steps.release-version.outputs.release_version }} - LAMBDA_HASH: ${{ steps.lambda-hash.outputs.lambda_hash }} - run: | - python - <<'PY' - import json - import os - from pathlib import Path - - Path("../../.lambda-release.json").write_text( - json.dumps( - { - "version": os.environ["RELEASE_VERSION"], - "base64sha256": os.environ["LAMBDA_HASH"], - }, - indent=2, - ) - + "\n" - ) - PY - - - name: Commit metadata update - working-directory: . 
- run: | - if git diff --quiet -- .lambda-release.json; then - exit 0 - fi - - git config user.name "github-actions[bot]" - git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - git add .lambda-release.json - git commit -m "chore: update lambda release hash" - git push diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml index 7352f33..38b3876 100644 --- a/.github/workflows/release-please.yml +++ b/.github/workflows/release-please.yml @@ -43,28 +43,9 @@ jobs: run: | TZ=UTC touch -t 198001010000 bootstrap zip -q -X lambda.zip bootstrap - - - name: Verify committed Lambda hash - run: | - expected_version=$(jq -r '.version' ../../.lambda-release.json) - actual_version=$(jq -r '."."' ../../.release-please-manifest.json) - test "$expected_version" = "$actual_version" - - expected_hash=$(jq -r '.base64sha256' ../../.lambda-release.json) - actual_hash=$(openssl dgst -sha256 -binary lambda.zip | openssl base64 -A) - test "$expected_hash" = "$actual_hash" + openssl dgst -sha256 -binary lambda.zip | openssl base64 -A > lambda.zip.base64sha256 - name: Upload to versioned release env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: gh release upload "${{ needs.release-please.outputs.tag_name }}" lambda.zip --clobber - - - name: Update rolling latest release - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - gh release create nat-zero-lambda-latest \ - --title "nat-zero Lambda (latest)" \ - --notes "Auto-built Go Lambda binary from ${{ needs.release-please.outputs.tag_name }}" \ - --latest=false 2>/dev/null || true - gh release upload nat-zero-lambda-latest lambda.zip --clobber + run: gh release upload "${{ needs.release-please.outputs.tag_name }}" lambda.zip lambda.zip.base64sha256 --clobber diff --git a/.lambda-release.json b/.lambda-release.json deleted file mode 100644 index c9e9f52..0000000 --- a/.lambda-release.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": "0.3.0", - "base64sha256": 
"Fk59AnRhiNu5O35sYG3VMM+iGPTu1il5La5+O1a51uo=" -} diff --git a/README.md b/README.md index 86f51d6..3dbee68 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,23 @@ module "nat_zero" { See [Examples](docs/examples.md) for spot instances, custom AMIs, and building from source. +## Lambda Code Paths + +The module intentionally supports exactly three ways to supply the Lambda binary: + +1. Default release artifact + - Normal path for end users + - The module downloads the versioned `lambda.zip` and reads the matching `lambda.zip.base64sha256` from the tagged GitHub release + - The checksum file exists so Terraform can know the Lambda code hash during `plan`, before it downloads the zip during `apply` + - When a new release publishes a different checksum, Terraform sees the `source_code_hash` change during `plan` and knows the Lambda must be updated +2. Pre-built local zip via `lambda_binary_path` + - Best for CI, unreleased branch testing, or custom binaries + - Terraform hashes the local file during plan +3. Apply-time build via `build_lambda_locally = true` + - Local development only + - Requires Go and `zip` + - May require a second apply after Lambda code changes + ## Performance | Scenario | Time to connectivity | @@ -112,6 +129,7 @@ See [Performance](docs/performance.md) for detailed timings and cost breakdowns. |------|---------| | [terraform](#requirement\_terraform) | >= 1.4 | | [aws](#requirement\_aws) | >= 5.0 | +| [http](#requirement\_http) | >= 3.0 | | [null](#requirement\_null) | >= 3.0 | | [time](#requirement\_time) | >= 0.9 | @@ -145,6 +163,7 @@ No modules. 
| [time_sleep.eventbridge_propagation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [time_sleep.lambda_ready](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [aws_ami.nat](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | +| [http_http.lambda_binary_hash](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | ## Inputs diff --git a/docs/examples.md b/docs/examples.md index 2c0c802..8a2e4af 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -133,9 +133,18 @@ module "nat_zero" { } ``` +## Lambda Code Paths + +This repo intentionally supports exactly three Lambda code paths: + +1. Default release path: do nothing extra. The module downloads the versioned `lambda.zip` and `lambda.zip.base64sha256` that match the tagged module release. + The checksum file exists so Terraform can know `source_code_hash` during `plan`, before it downloads the zip during `apply`. When the published checksum changes, Terraform can see the upstream Lambda code change in the plan. +2. Pre-built local zip: pass `lambda_binary_path` to test an unreleased branch or supply your own artifact. +3. Build during apply: set `build_lambda_locally = true` for local development only. + ## Building Lambda Locally -For development or if you want to build from source during `terraform apply`: +For development only, or if you explicitly want Terraform to build from source during `terraform apply`: ```hcl module "nat_zero" { @@ -151,7 +160,7 @@ Requires Go and `zip` installed locally. 
This is a non-standard path and may req ## Using a Pre-built Local Lambda Zip -For CI or local testing, it is cleaner to build the zip outside Terraform and pass it in directly: +For CI, branch testing, or if you want plan-time Lambda diffs without waiting for a release, build the zip outside Terraform and pass it in directly: ```hcl module "nat_zero" { @@ -163,4 +172,4 @@ module "nat_zero" { } ``` -This is also the right way to test an unreleased branch when the branch includes Lambda code changes. The default downloaded Lambda zip is pinned to the latest tagged module release. +This is the right way to test an unreleased branch when the branch includes Lambda code changes. The default downloaded Lambda zip is pinned to the latest tagged module release. diff --git a/docs/reference.md b/docs/reference.md index 2d74990..7777c48 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -4,6 +4,7 @@ |------|---------| | [terraform](#requirement\_terraform) | >= 1.4 | | [aws](#requirement\_aws) | >= 5.0 | +| [http](#requirement\_http) | >= 3.0 | | [null](#requirement\_null) | >= 3.0 | | [time](#requirement\_time) | >= 0.9 | @@ -37,6 +38,7 @@ No modules. | [time_sleep.eventbridge_propagation](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [time_sleep.lambda_ready](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [aws_ami.nat](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | +| [http_http.lambda_binary_hash](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | ## Inputs diff --git a/docs/workflows.md b/docs/workflows.md index bbaefcc..902c618 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -100,31 +100,16 @@ Runs `googleapis/release-please-action@v4` with: - Creates a **GitHub Release** with a version tag (e.g., `v0.1.0`). 
- Sets output `release_created=true` and `tag_name=v0.1.0`. -## Prepare Release Lambda (`release-please-lambda.yml`) - -Runs on release-please PRs before merge. - -1. Checks out the `release-please--...` branch. -2. Builds a deterministic `linux/arm64` Lambda zip from the PR contents. -3. Computes its base64 SHA256. -4. Updates `.lambda-release.json` with the release version from `.release-please-manifest.json` and the matching hash. -5. Commits that metadata back onto the release PR branch if it changed. - -This keeps the module default simple at runtime: - -- Terraform downloads a versioned `lambda.zip` release asset. -- Terraform reads the matching committed hash from `.lambda-release.json`. -- No checksum download is needed during `terraform plan`. - ### Job 2: `build-lambda` Only runs when `release_created == 'true'` (i.e., the push that merges a release PR). 1. Cross-compiles the Go Lambda for `linux/arm64`. 2. Creates a deterministic `lambda.zip`. -3. Verifies the built zip matches the committed metadata in `.lambda-release.json`. -4. **Uploads the zip to the versioned release** (e.g., `v0.1.0`). -5. **Creates/updates a rolling `nat-zero-lambda-latest` release** with the same zip for convenience, but the module default pins to the versioned release asset that matches the tagged module version. +3. Writes `lambda.zip.base64sha256`, containing the base64-encoded SHA256 for the zip. +4. **Uploads the zip and checksum to the versioned release** (e.g., `v0.1.0`). + +That is the full release artifact flow. There is no second workflow that edits the release PR, and there is no rolling "latest" Lambda artifact to keep in sync. 
### Changelog sections @@ -174,5 +159,22 @@ Post-merge to main: Merge release PR: -> release-please creates GitHub Release + tag - -> build-lambda uploads lambda.zip to release + rolling latest + -> build-lambda uploads lambda.zip + lambda.zip.base64sha256 to that versioned release ``` + +## Lambda Code Paths + +The module intentionally supports exactly three ways to supply Lambda code: + +1. Default release artifact + - Best for normal users + - Terraform downloads the versioned `lambda.zip` and reads the matching `lambda.zip.base64sha256` + - The checksum file lets Terraform know `source_code_hash` during `plan`, before the zip is downloaded during `apply` + - A changed published checksum shows up as a Lambda code change in `terraform plan` +2. Pre-built local zip via `lambda_binary_path` + - Best for CI, branch testing, or custom unreleased binaries + - Terraform hashes the local file during plan +3. Apply-time build via `build_lambda_locally = true` + - Best for local development only + - Requires Go and `zip` + - May require a second apply after Lambda code changes diff --git a/lambda.tf b/lambda.tf index 38bbea7..e03a6bd 100644 --- a/lambda.tf +++ b/lambda.tf @@ -18,27 +18,32 @@ resource "time_sleep" "lambda_ready" { } locals { - lambda_release_metadata = jsondecode(file("${path.module}/.lambda-release.json")) - module_release_version = jsondecode(file("${path.module}/.release-please-manifest.json"))["."] - default_lambda_binary_url = "https://github.com/MachineDotDev/nat-zero/releases/download/v${local.module_release_version}/lambda.zip" - default_lambda_binary_base64sha256 = local.lambda_release_metadata.base64sha256 - downloaded_lambda_zip_path = "${path.module}/.build/lambda.zip" - local_lambda_zip_path = coalesce(var.lambda_binary_path, local.downloaded_lambda_zip_path) + module_release_version = jsondecode(file("${path.module}/.release-please-manifest.json"))["."] + default_lambda_binary_url = 
"https://github.com/MachineDotDev/nat-zero/releases/download/v${local.module_release_version}/lambda.zip" + lambda_binary_hash_url = "${local.default_lambda_binary_url}.base64sha256" + downloaded_lambda_zip_path = "${path.module}/.build/lambda.zip" + local_lambda_zip_path = coalesce(var.lambda_binary_path, local.downloaded_lambda_zip_path) local_lambda_source_hash = var.lambda_binary_path != null ? ( filebase64sha256(var.lambda_binary_path) ) : ( fileexists(local.downloaded_lambda_zip_path) ? filebase64sha256(local.downloaded_lambda_zip_path) : null ) - lambda_source_hash = var.build_lambda_locally || var.lambda_binary_path != null ? local.local_lambda_source_hash : local.default_lambda_binary_base64sha256 + downloaded_lambda_source_hash = one(data.http.lambda_binary_hash[*].response_body) + lambda_source_hash = var.build_lambda_locally || var.lambda_binary_path != null ? local.local_lambda_source_hash : trimspace(local.downloaded_lambda_source_hash) +} + +data "http" "lambda_binary_hash" { + count = var.build_lambda_locally || var.lambda_binary_path != null ? 0 : 1 + url = local.lambda_binary_hash_url } resource "terraform_data" "download_lambda" { count = var.build_lambda_locally || var.lambda_binary_path != null ? 0 : 1 triggers_replace = [ - path.module, local.default_lambda_binary_url, - local.default_lambda_binary_base64sha256, + local.lambda_binary_hash_url, + trimspace(local.downloaded_lambda_source_hash), ] provisioner "local-exec" { @@ -53,7 +58,6 @@ resource "null_resource" "build_lambda" { count = var.build_lambda_locally ? 1 : 0 triggers = { - module_path = path.module source_hash = sha256(join("", [ for f in sort(concat( tolist(fileset("${path.module}/cmd/lambda", "*.go")), @@ -111,11 +115,6 @@ resource "aws_lambda_function" "nat_zero" { condition = !(var.build_lambda_locally && var.lambda_binary_path != null) error_message = "build_lambda_locally and lambda_binary_path cannot be used together." 
} - - precondition { - condition = var.build_lambda_locally || var.lambda_binary_path != null || local.lambda_release_metadata.version == local.module_release_version - error_message = ".lambda-release.json must match .release-please-manifest.json when using the bundled Lambda release artifact." - } } depends_on = [time_sleep.lambda_ready, terraform_data.download_lambda, null_resource.build_lambda] diff --git a/versions.tf b/versions.tf index 165d398..ff35e50 100644 --- a/versions.tf +++ b/versions.tf @@ -6,6 +6,10 @@ terraform { source = "hashicorp/aws" version = ">= 5.0" } + http = { + source = "hashicorp/http" + version = ">= 3.0" + } null = { source = "hashicorp/null" version = ">= 3.0" From fcfa6eb3e292c85db0d82642714b43eb8dfa68f0 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 09:47:34 +1000 Subject: [PATCH 18/27] docs: recommend lambda usage by audience --- README.md | 10 ++++++++++ docs/examples.md | 10 ++++++++++ docs/index.md | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3dbee68..3add691 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,16 @@ The module intentionally supports exactly three ways to supply the Lambda binary - Requires Go and `zip` - May require a second apply after Lambda code changes +## Recommended Usage + +| Audience | Recommended module ref | Recommended Lambda path | Why | +|----------|------------------------|-------------------------|-----| +| Normal end users | Release tag such as `?ref=v0.4.0` | Default release artifact | Stable module code, stable versioned Lambda artifact, and clean plan/apply behavior | +| CI, branch testing, unreleased validation | Branch or commit ref | `lambda_binary_path` | Lets Terraform see Lambda code changes during plan before the branch has been released | +| Local module development | Working tree | `build_lambda_locally = true` | Fastest iteration loop while changing Go code inside this repo | + +`ref=main` is suitable for 
development, but it is not the stable consumption path for end users. If `main` has unreleased Go changes, the default Lambda artifact still comes from the latest tagged release until a new release is cut. + ## Performance | Scenario | Time to connectivity | diff --git a/docs/examples.md b/docs/examples.md index 8a2e4af..b0814bd 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -142,6 +142,16 @@ This repo intentionally supports exactly three Lambda code paths: 2. Pre-built local zip: pass `lambda_binary_path` to test an unreleased branch or supply your own artifact. 3. Build during apply: set `build_lambda_locally = true` for local development only. +## Recommended Usage By Audience + +| Audience | Recommended module ref | Recommended Lambda path | Why | +|----------|------------------------|-------------------------|-----| +| Normal end users | Release tag such as `?ref=v0.4.0` | Default release artifact | Stable module code, stable versioned Lambda artifact, and clean plan/apply behavior | +| CI, branch testing, unreleased validation | Branch or commit ref | `lambda_binary_path` | Lets Terraform see Lambda code changes during plan before the branch has been released | +| Local module development | Working tree | `build_lambda_locally = true` | Fastest iteration loop while changing Go code inside this repo | + +`ref=main` is fine for development, but it is not the stable consumer path. If `main` has unreleased Go changes, the default Lambda artifact still comes from the latest tagged release until the next release is published. 
+ ## Building Lambda Locally For development only, or if you explicitly want Terraform to build from source during `terraform apply`: diff --git a/docs/index.md b/docs/index.md index ab08889..6b2ad71 100644 --- a/docs/index.md +++ b/docs/index.md @@ -34,7 +34,7 @@ module "nat_zero" { - [Architecture](architecture.md) — reconciliation model, decision matrix, event flows - [Performance](performance.md) — startup latency, Lambda execution times, cost breakdowns -- [Examples](examples.md) — spot instances, custom AMIs, building from source +- [Examples](examples.md) — spot instances, custom AMIs, Lambda code paths, recommended usage by audience - [Terraform Reference](reference.md) — inputs, outputs, resources - [Testing](testing.md) — integration test lifecycle and CI From a0f6591f05d3d4026c9d6e6dafce48d8dad3385c Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 09:56:31 +1000 Subject: [PATCH 19/27] docs: clarify integration-only lambda overrides --- docs/testing.md | 1 + docs/workflows.md | 4 +++- tests/integration/fixture/main.tf | 9 +++++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/testing.md b/docs/testing.md index 30d179c..239e962 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -65,6 +65,7 @@ Integration tests run in GitHub Actions when the `integration-test` label is add - Timeout: 15 minutes - Region: us-east-1 - Default NAT AMI: shared private test nat-zero AMI supplied via the GitHub Actions variable `NAT_ZERO_TEST_AMI_ID` unless `nat_ami_id` is supplied explicitly +- These are integration-fixture overrides only. Normal module consumers should not set `nat_ami_id`; the module defaults to the published nat-zero AMI track. ## Orphan Detection diff --git a/docs/workflows.md b/docs/workflows.md index 902c618..49b9d73 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -42,9 +42,11 @@ Full end-to-end test: deploys real AWS infrastructure via Terratest, exercises t - **Timeout**: 15 minutes. 
- **Job name**: `integration-test` (required status check for merge). - **Optional inputs**: - - `nat_ami_id` to force the fixture onto a specific NAT AMI. If omitted, the workflow uses the shared private test AMI from the GitHub Actions variable `NAT_ZERO_TEST_AMI_ID`. + - `nat_ami_id` to force the integration fixture onto a specific NAT AMI. If omitted, the workflow uses the shared private test AMI from the GitHub Actions variable `NAT_ZERO_TEST_AMI_ID`. - `updated_nat_ami_id` to exercise the AMI replacement path after a second `terraform apply`. +These inputs are test-only fixture controls. Normal module consumers should omit them and use the published nat-zero AMI defaults. + ### Steps 1. Checkout, setup Go, setup Terraform (wrapper disabled). diff --git a/tests/integration/fixture/main.tf b/tests/integration/fixture/main.tf index 820ebf9..d8f208a 100644 --- a/tests/integration/fixture/main.tf +++ b/tests/integration/fixture/main.tf @@ -96,8 +96,13 @@ module "nat_zero" { instance_type = var.nat_instance_type market_type = "on-demand" encrypt_root_volume = var.encrypt_root_volume - ami_id = var.nat_ami_id - lambda_binary_path = fileexists("${path.module}/../../.build/lambda.zip") ? abspath("${path.module}/../../.build/lambda.zip") : null + + # Test-only overrides: + # - ami_id lets the integration suite force a specific baseline or upgraded NAT AMI. + # - lambda_binary_path lets branch tests exercise unreleased Lambda code. + # Normal module consumers should omit both and use the published defaults. + ami_id = var.nat_ami_id + lambda_binary_path = fileexists("${path.module}/../../.build/lambda.zip") ? 
abspath("${path.module}/../../.build/lambda.zip") : null } output "vpc_id" { From 6f9c1ea3cc49cc278c12cb993f227b1d37201d2d Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 10:53:03 +1000 Subject: [PATCH 20/27] ci: route manual PR checks through a single workflow --- .github/workflows/integration-tests.yml | 20 +++++----- .github/workflows/manual-pr-checks.yml | 24 ++++++++++++ .github/workflows/nat-images.yml | 49 +++++++++++++++---------- docs/testing.md | 2 +- docs/workflows.md | 23 +++++++++--- 5 files changed, 82 insertions(+), 36 deletions(-) create mode 100644 .github/workflows/manual-pr-checks.yml diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index fa34f79..4da113f 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -1,8 +1,6 @@ name: Integration Tests on: - pull_request: - types: [labeled] workflow_dispatch: inputs: nat_ami_id: @@ -13,6 +11,10 @@ on: description: Optional replacement NAT AMI ID to exercise the AMI upgrade path required: false type: string + checkout_ref: + description: Optional git ref to checkout before running the test + required: false + type: string workflow_call: inputs: nat_ami_id: @@ -21,6 +23,9 @@ on: updated_nat_ami_id: required: false type: string + checkout_ref: + required: false + type: string concurrency: group: nat-zero-integration @@ -35,16 +40,13 @@ env: jobs: integration-test: - if: >- - github.event_name != 'pull_request' || - github.event.label.name == 'integration-test' || - inputs.nat_ami_id != '' || - inputs.updated_nat_ami_id != '' runs-on: ubuntu-latest timeout-minutes: 15 environment: integration steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + ref: ${{ inputs.checkout_ref || github.ref_name }} - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 with: @@ -61,8 +63,8 @@ jobs: - name: Resolve NAT AMI inputs env: - INPUT_NAT_AMI_ID: ${{ 
inputs.nat_ami_id || github.event.inputs.nat_ami_id || '' }} - INPUT_UPDATED_NAT_AMI_ID: ${{ inputs.updated_nat_ami_id || github.event.inputs.updated_nat_ami_id || '' }} + INPUT_NAT_AMI_ID: ${{ inputs.nat_ami_id }} + INPUT_UPDATED_NAT_AMI_ID: ${{ inputs.updated_nat_ami_id }} run: | nat_ami_id="${INPUT_NAT_AMI_ID:-$TEST_NAT_AMI_ID}" diff --git a/.github/workflows/manual-pr-checks.yml b/.github/workflows/manual-pr-checks.yml new file mode 100644 index 0000000..433a3a0 --- /dev/null +++ b/.github/workflows/manual-pr-checks.yml @@ -0,0 +1,24 @@ +name: Manual PR Checks + +on: + pull_request: + types: [labeled] + +permissions: + contents: read + id-token: write + +jobs: + integration: + if: ${{ github.event.label.name == 'integration-test' }} + uses: ./.github/workflows/integration-tests.yml + secrets: inherit + with: + checkout_ref: ${{ github.event.pull_request.head.ref }} + + nat-images: + if: ${{ github.event.label.name == 'nat-images' }} + uses: ./.github/workflows/nat-images.yml + secrets: inherit + with: + checkout_ref: ${{ github.event.pull_request.head.ref }} diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml index 90ae6d4..9f50083 100644 --- a/.github/workflows/nat-images.yml +++ b/.github/workflows/nat-images.yml @@ -17,9 +17,25 @@ on: required: false default: true type: boolean - pull_request: - types: - - labeled + checkout_ref: + description: Optional git ref to checkout before running the workflow + required: false + type: string + workflow_call: + inputs: + build_subnet_id: + required: false + type: string + source_region: + required: false + type: string + run_integration_gate: + required: false + default: true + type: boolean + checkout_ref: + required: false + type: string concurrency: group: nat-zero-ami @@ -27,6 +43,7 @@ concurrency: permissions: contents: read + id-token: write env: PACKER_REGIONS_FILE: ami/nat-zero-private-all-regions.pkrvars.hcl @@ -40,7 +57,6 @@ jobs: build_subnet_id: ${{ 
steps.resolve.outputs.build_subnet_id }} source_region: ${{ steps.resolve.outputs.source_region }} run_integration_gate: ${{ steps.resolve.outputs.run_integration_gate }} - should_run: ${{ steps.resolve.outputs.should_run }} should_publish: ${{ steps.resolve.outputs.should_publish }} checkout_ref: ${{ steps.resolve.outputs.checkout_ref }} steps: @@ -48,30 +64,25 @@ jobs: id: resolve env: EVENT_NAME: ${{ github.event_name }} - EVENT_LABEL: ${{ github.event.label.name }} - INPUT_BUILD_SUBNET_ID: ${{ github.event.inputs.build_subnet_id }} - INPUT_SOURCE_REGION: ${{ github.event.inputs.source_region }} - INPUT_RUN_INTEGRATION_GATE: ${{ github.event.inputs.run_integration_gate }} + INPUT_BUILD_SUBNET_ID: ${{ inputs.build_subnet_id }} + INPUT_SOURCE_REGION: ${{ inputs.source_region }} + INPUT_RUN_INTEGRATION_GATE: ${{ inputs.run_integration_gate }} + INPUT_CHECKOUT_REF: ${{ inputs.checkout_ref }} DEFAULT_BUILD_SUBNET_ID: ${{ vars.NAT_ZERO_AMI_BUILD_SUBNET_ID }} - GITHUB_HEAD_REF: ${{ github.head_ref }} GITHUB_REF_NAME: ${{ github.ref_name }} run: | - should_run=true should_publish=true - if [ "$EVENT_NAME" = "pull_request" ]; then - if [ "$EVENT_LABEL" != "nat-images" ]; then - should_run=false - fi + if [ "$EVENT_NAME" = "workflow_call" ]; then should_publish=false fi build_subnet_id="${INPUT_BUILD_SUBNET_ID:-$DEFAULT_BUILD_SUBNET_ID}" source_region="${INPUT_SOURCE_REGION:-us-east-1}" run_integration_gate="${INPUT_RUN_INTEGRATION_GATE:-true}" - checkout_ref="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}" + checkout_ref="${INPUT_CHECKOUT_REF:-$GITHUB_REF_NAME}" - if [ "$should_run" = "true" ] && [ -z "$build_subnet_id" ]; then + if [ -z "$build_subnet_id" ]; then echo "build_subnet_id input is required unless vars.NAT_ZERO_AMI_BUILD_SUBNET_ID is set" >&2 exit 1 fi @@ -80,14 +91,12 @@ jobs: echo "build_subnet_id=$build_subnet_id" echo "source_region=$source_region" echo "run_integration_gate=$run_integration_gate" - echo "should_run=$should_run" echo 
"should_publish=$should_publish" echo "checkout_ref=$checkout_ref" } >> "$GITHUB_OUTPUT" build-and-copy: needs: resolve-inputs - if: ${{ needs.resolve-inputs.outputs.should_run == 'true' }} runs-on: ubuntu-latest environment: ami-build permissions: @@ -182,13 +191,14 @@ jobs: echo "test_ami_id=$test_ami_id" >> "$GITHUB_OUTPUT" integration: - if: ${{ needs.resolve-inputs.outputs.should_run == 'true' && needs.resolve-inputs.outputs.run_integration_gate == 'true' }} + if: ${{ needs.resolve-inputs.outputs.run_integration_gate == 'true' }} needs: - resolve-inputs - build-and-copy uses: ./.github/workflows/integration-tests.yml secrets: inherit with: + checkout_ref: ${{ needs.resolve-inputs.outputs.checkout_ref }} nat_ami_id: ${{ vars.NAT_ZERO_TEST_AMI_ID }} updated_nat_ami_id: ${{ needs.build-and-copy.outputs.test_ami_id }} @@ -199,7 +209,6 @@ jobs: - integration if: >- always() && - needs.resolve-inputs.outputs.should_run == 'true' && needs.resolve-inputs.outputs.should_publish == 'true' && needs.build-and-copy.result == 'success' && ( diff --git a/docs/testing.md b/docs/testing.md index 239e962..68fb08a 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -59,7 +59,7 @@ The test uses [Terratest](https://terratest.gruntwork.io/) with a single `terraf ## CI -Integration tests run in GitHub Actions when the `integration-test` label is added to a PR. They use OIDC to assume an AWS role in a dedicated test account. +Integration tests run in GitHub Actions when the `integration-test` label is added to a PR. A small router workflow handles the label event and then calls the reusable integration workflow. The tests use OIDC to assume an AWS role in a dedicated test account. 
- Concurrency: one test at a time (`cancel-in-progress: false`) - Timeout: 15 minutes diff --git a/docs/workflows.md b/docs/workflows.md index 49b9d73..8689bc1 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -6,10 +6,11 @@ Internal reference for GitHub Actions workflows, repo rulesets, and the release | Workflow | File | Triggers | Required Check | |----------|------|----------|----------------| +| Manual PR Checks | `manual-pr-checks.yml` | PR labeled `integration-test` or `nat-images` | No (router workflow) | | Pre-commit | `precommit.yml` | All PRs | `precommit` | | Go Tests | `go-tests.yml` | PRs touching `cmd/lambda/**`; push to `main` | `go-test` | -| Integration Tests | `integration-tests.yml` | PR labeled `integration-test`; manual dispatch; reusable workflow | `integration-test` | -| NAT Images | `nat-images.yml` | Manual dispatch; PR labeled `nat-images` | No (promotion workflow) | +| Integration Tests | `integration-tests.yml` | Manual dispatch; reusable workflow | `integration-test` | +| NAT Images | `nat-images.yml` | Manual dispatch; reusable workflow | No (promotion workflow) | | Docs | `docs.yml` | Push to `main` (filtered paths) | No (post-merge deploy) | | Release | `release-please.yml` | Push to `main`; manual dispatch | No (post-merge) | @@ -29,14 +30,23 @@ Runs `go test -v -race ./...` in `cmd/lambda/` (Lambda unit tests). - **Job name**: `go-test` (required status check for merge). - **Note**: Path-filtered. If a PR doesn't touch Go code, this check won't run and won't block merge (see ruleset notes below). +## Manual PR Checks (`manual-pr-checks.yml`) + +Single entry point for expensive, manually requested PR checks. + +- **PR trigger**: `labeled` type only. +- **Labels**: + - `integration-test` -> calls the reusable integration workflow + - `nat-images` -> calls the reusable NAT image workflow +- **Why this exists**: GitHub cannot filter `pull_request:labeled` by label name up front. 
A single router workflow keeps that complexity in one place and prevents both heavyweight workflows from waking up on every label event. +- **How it appears on the PR**: the called reusable jobs show up as normal PR checks under the router workflow run. + ## Integration Tests (`integration-tests.yml`) Full end-to-end test: deploys real AWS infrastructure via Terratest, exercises the Lambda lifecycle (create NAT, scale-down, restart, cleanup), then destroys everything. -- **PR trigger**: `labeled` type only. Runs when the `integration-test` label is added. - **Manual trigger**: `workflow_dispatch`. - **Reusable trigger**: `workflow_call`. -- **Condition**: `github.event.label.name == 'integration-test'` (or manual dispatch). - **Concurrency**: Group `nat-zero-integration`, `cancel-in-progress: false`. Only one integration test runs at a time; new ones queue. - **Environment**: `integration` (holds the `INTEGRATION_ROLE_ARN` secret for OIDC). - **Timeout**: 15 minutes. @@ -68,7 +78,7 @@ Manual promotion workflow for the default public nat-zero AMI. 4. After the integration gates pass, run a small publish script that opens launch permissions for the copied AMIs. 5. Open a PR that updates the Terraform defaults (`ami_owner_account`, `ami_name_pattern`) so merge + release-please can publish the new module version. -For pre-merge validation on a branch, add the `nat-images` label to the PR. That trigger uses the GitHub Actions variable `NAT_ZERO_AMI_BUILD_SUBNET_ID`, runs the build and integration gates on the PR branch, and intentionally skips the public-sharing and promotion-PR jobs. +For pre-merge validation on a branch, add the `nat-images` label to the PR. The router workflow calls `nat-images.yml` as a reusable workflow, which uses the GitHub Actions variable `NAT_ZERO_AMI_BUILD_SUBNET_ID`, runs the build and integration gates on the PR branch, and intentionally skips the public-sharing and promotion-PR jobs. 
## Docs (`docs.yml`) @@ -151,7 +161,8 @@ That is the full release artifact flow. There is no second workflow that edits t Open PR -> precommit runs (always) -> go-test runs (if cmd/lambda/** changed) - -> Add "integration-test" label -> integration tests run against real AWS + -> Add "integration-test" label -> router calls integration tests + -> Add "nat-images" label -> router calls the NAT image build/integration gate -> threads resolved -> Squash merge to main From f49cadfe80a44ce73de730893687c8b10070c30b Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 11:02:10 +1000 Subject: [PATCH 21/27] ci: make manual PR labels one-shot triggers --- .github/workflows/integration-tests.yml | 9 --------- .github/workflows/manual-pr-checks.yml | 20 ++++++++++++++++---- .github/workflows/nat-images.yml | 15 --------------- docs/workflows.md | 1 + 4 files changed, 17 insertions(+), 28 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 4da113f..664e37b 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -11,10 +11,6 @@ on: description: Optional replacement NAT AMI ID to exercise the AMI upgrade path required: false type: string - checkout_ref: - description: Optional git ref to checkout before running the test - required: false - type: string workflow_call: inputs: nat_ami_id: @@ -23,9 +19,6 @@ on: updated_nat_ami_id: required: false type: string - checkout_ref: - required: false - type: string concurrency: group: nat-zero-integration @@ -45,8 +38,6 @@ jobs: environment: integration steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - with: - ref: ${{ inputs.checkout_ref || github.ref_name }} - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 with: diff --git a/.github/workflows/manual-pr-checks.yml b/.github/workflows/manual-pr-checks.yml index 433a3a0..4c67293 100644 --- 
a/.github/workflows/manual-pr-checks.yml +++ b/.github/workflows/manual-pr-checks.yml @@ -7,18 +7,30 @@ on: permissions: contents: read id-token: write + pull-requests: write jobs: integration: if: ${{ github.event.label.name == 'integration-test' }} uses: ./.github/workflows/integration-tests.yml secrets: inherit - with: - checkout_ref: ${{ github.event.pull_request.head.ref }} nat-images: if: ${{ github.event.label.name == 'nat-images' }} uses: ./.github/workflows/nat-images.yml secrets: inherit - with: - checkout_ref: ${{ github.event.pull_request.head.ref }} + + clear-trigger-label: + if: >- + always() && + (github.event.label.name == 'integration-test' || github.event.label.name == 'nat-images') + needs: + - integration + - nat-images + runs-on: ubuntu-latest + steps: + - name: Remove trigger label + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + LABEL_NAME: ${{ github.event.label.name }} + run: gh pr edit "${{ github.event.pull_request.number }}" --remove-label "$LABEL_NAME" diff --git a/.github/workflows/nat-images.yml b/.github/workflows/nat-images.yml index 9f50083..868c426 100644 --- a/.github/workflows/nat-images.yml +++ b/.github/workflows/nat-images.yml @@ -17,10 +17,6 @@ on: required: false default: true type: boolean - checkout_ref: - description: Optional git ref to checkout before running the workflow - required: false - type: string workflow_call: inputs: build_subnet_id: @@ -33,9 +29,6 @@ on: required: false default: true type: boolean - checkout_ref: - required: false - type: string concurrency: group: nat-zero-ami @@ -58,7 +51,6 @@ jobs: source_region: ${{ steps.resolve.outputs.source_region }} run_integration_gate: ${{ steps.resolve.outputs.run_integration_gate }} should_publish: ${{ steps.resolve.outputs.should_publish }} - checkout_ref: ${{ steps.resolve.outputs.checkout_ref }} steps: - name: Resolve workflow inputs id: resolve @@ -67,9 +59,7 @@ jobs: INPUT_BUILD_SUBNET_ID: ${{ inputs.build_subnet_id }} INPUT_SOURCE_REGION: ${{ 
inputs.source_region }} INPUT_RUN_INTEGRATION_GATE: ${{ inputs.run_integration_gate }} - INPUT_CHECKOUT_REF: ${{ inputs.checkout_ref }} DEFAULT_BUILD_SUBNET_ID: ${{ vars.NAT_ZERO_AMI_BUILD_SUBNET_ID }} - GITHUB_REF_NAME: ${{ github.ref_name }} run: | should_publish=true @@ -80,7 +70,6 @@ jobs: build_subnet_id="${INPUT_BUILD_SUBNET_ID:-$DEFAULT_BUILD_SUBNET_ID}" source_region="${INPUT_SOURCE_REGION:-us-east-1}" run_integration_gate="${INPUT_RUN_INTEGRATION_GATE:-true}" - checkout_ref="${INPUT_CHECKOUT_REF:-$GITHUB_REF_NAME}" if [ -z "$build_subnet_id" ]; then echo "build_subnet_id input is required unless vars.NAT_ZERO_AMI_BUILD_SUBNET_ID is set" >&2 @@ -92,7 +81,6 @@ jobs: echo "source_region=$source_region" echo "run_integration_gate=$run_integration_gate" echo "should_publish=$should_publish" - echo "checkout_ref=$checkout_ref" } >> "$GITHUB_OUTPUT" build-and-copy: @@ -198,7 +186,6 @@ jobs: uses: ./.github/workflows/integration-tests.yml secrets: inherit with: - checkout_ref: ${{ needs.resolve-inputs.outputs.checkout_ref }} nat_ami_id: ${{ vars.NAT_ZERO_TEST_AMI_ID }} updated_nat_ami_id: ${{ needs.build-and-copy.outputs.test_ami_id }} @@ -251,8 +238,6 @@ jobs: pull-requests: write steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - with: - ref: ${{ needs.resolve-inputs.outputs.checkout_ref }} - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3 with: diff --git a/docs/workflows.md b/docs/workflows.md index 8689bc1..6378c7c 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -40,6 +40,7 @@ Single entry point for expensive, manually requested PR checks. - `nat-images` -> calls the reusable NAT image workflow - **Why this exists**: GitHub cannot filter `pull_request:labeled` by label name up front. A single router workflow keeps that complexity in one place and prevents both heavyweight workflows from waking up on every label event. 
- **How it appears on the PR**: the called reusable jobs show up as normal PR checks under the router workflow run. +- **One-shot labels**: the router removes the trigger label once the called workflow has finished (whether it passed or failed), so adding the same label later will trigger a fresh run again. ## Integration Tests (`integration-tests.yml`) From d3a3ce455b0a933ac5fe35604ad6ad9a6100aa5c Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 11:04:38 +1000 Subject: [PATCH 22/27] ci: allow manual check router to remove labels --- .github/workflows/manual-pr-checks.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/manual-pr-checks.yml b/.github/workflows/manual-pr-checks.yml index 4c67293..17314b0 100644 --- a/.github/workflows/manual-pr-checks.yml +++ b/.github/workflows/manual-pr-checks.yml @@ -7,6 +7,7 @@ on: permissions: contents: read id-token: write + issues: write pull-requests: write jobs: From f68bd7d45cb668bf1e1de98c63a9a631ad375034 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 11:05:42 +1000 Subject: [PATCH 23/27] ci: match manual check router permissions to nested workflows --- .github/workflows/manual-pr-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual-pr-checks.yml b/.github/workflows/manual-pr-checks.yml index 17314b0..b3918f4 100644 --- a/.github/workflows/manual-pr-checks.yml +++ b/.github/workflows/manual-pr-checks.yml @@ -5,7 +5,7 @@ on: types: [labeled] permissions: - contents: read + contents: write id-token: write issues: write pull-requests: write From fa2b712716ac1b696e559961fa463f013912f559 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 11:16:22 +1000 Subject: [PATCH 24/27] ci: remove manual trigger labels via api --- .github/workflows/manual-pr-checks.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/manual-pr-checks.yml b/.github/workflows/manual-pr-checks.yml index b3918f4..16e5c5b
100644 --- a/.github/workflows/manual-pr-checks.yml +++ b/.github/workflows/manual-pr-checks.yml @@ -31,7 +31,13 @@ jobs: runs-on: ubuntu-latest steps: - name: Remove trigger label + continue-on-error: true env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} LABEL_NAME: ${{ github.event.label.name }} - run: gh pr edit "${{ github.event.pull_request.number }}" --remove-label "$LABEL_NAME" + REPOSITORY: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + run: | + gh api \ + --method DELETE \ + "repos/$REPOSITORY/issues/$PR_NUMBER/labels/$LABEL_NAME" From e246f6a0692d7de0f951ae81e0c632b256ca76e6 Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 11:42:03 +1000 Subject: [PATCH 25/27] fix: handle launch template version fallback --- cmd/lambda/ec2ops.go | 8 +++-- cmd/lambda/ec2ops_test.go | 52 ++++++++++++++++++++++++++++++ tests/integration/nat_zero_test.go | 33 +++++++++++++------ 3 files changed, 82 insertions(+), 11 deletions(-) diff --git a/cmd/lambda/ec2ops.go b/cmd/lambda/ec2ops.go index 4fade97..fd16644 100644 --- a/cmd/lambda/ec2ops.go +++ b/cmd/lambda/ec2ops.go @@ -335,7 +335,7 @@ func (h *Handler) resolveLT(ctx context.Context, az, vpc string) (string, int64) ltID := aws.ToString(resp.LaunchTemplates[0].LaunchTemplateId) version := aws.ToInt64(resp.LaunchTemplates[0].LatestVersionNumber) if version == 0 { - return "", 0 + version = aws.ToInt64(resp.LaunchTemplates[0].DefaultVersionNumber) } return ltID, version } @@ -352,11 +352,15 @@ func (h *Handler) createNAT(ctx context.Context, az, vpc string) string { input := &ec2.RunInstancesInput{ LaunchTemplate: &ec2types.LaunchTemplateSpecification{ LaunchTemplateId: aws.String(ltID), - Version: aws.String(fmt.Sprintf("%d", version)), }, MinCount: aws.Int32(1), MaxCount: aws.Int32(1), } + if version > 0 { + input.LaunchTemplate.Version = aws.String(fmt.Sprintf("%d", version)) + } else { + log.Printf("Launch template %s has no version metadata, using EC2 default version", 
ltID) + } if h.ConfigVersion != "" { input.TagSpecifications = []ec2types.TagSpecification{{ diff --git a/cmd/lambda/ec2ops_test.go b/cmd/lambda/ec2ops_test.go index bc1fc0f..8bd3fb8 100644 --- a/cmd/lambda/ec2ops_test.go +++ b/cmd/lambda/ec2ops_test.go @@ -395,6 +395,58 @@ func TestCreateNAT(t *testing.T) { } }) + t.Run("falls back to default version when latest version missing", func(t *testing.T) { + mock := &mockEC2{} + mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { + return &ec2.DescribeLaunchTemplatesOutput{ + LaunchTemplates: []ec2types.LaunchTemplate{{ + LaunchTemplateId: aws.String("lt-123"), + DefaultVersionNumber: aws.Int64(2), + }}, + }, nil + } + mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { + if params.LaunchTemplate == nil || aws.ToString(params.LaunchTemplate.Version) != "2" { + t.Fatalf("expected launch template version 2, got %#v", params.LaunchTemplate) + } + return &ec2.RunInstancesOutput{ + Instances: []ec2types.Instance{{InstanceId: aws.String("i-new2")}}, + }, nil + } + h := newTestHandler(mock) + result := h.createNAT(context.Background(), testAZ, testVPC) + if result != "i-new2" { + t.Errorf("expected i-new2, got %s", result) + } + }) + + t.Run("uses template without explicit version when metadata missing", func(t *testing.T) { + mock := &mockEC2{} + mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { + return &ec2.DescribeLaunchTemplatesOutput{ + LaunchTemplates: []ec2types.LaunchTemplate{{ + LaunchTemplateId: aws.String("lt-123"), + }}, + }, nil + } + mock.RunInstancesFn = func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { 
+ if params.LaunchTemplate == nil || aws.ToString(params.LaunchTemplate.LaunchTemplateId) != "lt-123" { + t.Fatalf("expected launch template id lt-123, got %#v", params.LaunchTemplate) + } + if params.LaunchTemplate.Version != nil { + t.Fatalf("expected launch template version to be omitted, got %q", aws.ToString(params.LaunchTemplate.Version)) + } + return &ec2.RunInstancesOutput{ + Instances: []ec2types.Instance{{InstanceId: aws.String("i-new3")}}, + }, nil + } + h := newTestHandler(mock) + result := h.createNAT(context.Background(), testAZ, testVPC) + if result != "i-new3" { + t.Errorf("expected i-new3, got %s", result) + } + }) + t.Run("no launch template", func(t *testing.T) { mock := &mockEC2{} mock.DescribeLaunchTemplatesFn = func(ctx context.Context, params *ec2.DescribeLaunchTemplatesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) { diff --git a/tests/integration/nat_zero_test.go b/tests/integration/nat_zero_test.go index c7f8a23..abdb182 100644 --- a/tests/integration/nat_zero_test.go +++ b/tests/integration/nat_zero_test.go @@ -189,11 +189,18 @@ func TestNatZero(t *testing.T) { // Shared across phases — set by Phase 1, used by Phase 2. var activeWorkloadID string + runPhase := func(name string, fn func(t *testing.T)) bool { + if t.Run(name, fn) { + return true + } + t.Logf("Phase %s failed, aborting remaining phases so deferred cleanup can run", name) + return false + } // ── Phase 1: NAT creation and connectivity ────────────────────────── // Launch a workload and let EventBridge trigger the Lambda automatically. 
- t.Run("NATCreationAndConnectivity", func(t *testing.T) { + if !runPhase("NATCreationAndConnectivity", func(t *testing.T) { wlStart := time.Now() activeWorkloadID = launchWorkload(t, ec2Client, privateSubnet, amiID, runID, profileName, queueURL) record("Launch workload instance", time.Since(wlStart)) @@ -241,13 +248,15 @@ func TestNatZero(t *testing.T) { assert.Equal(t, natEIP, msg.EgressIP, "workload egress IP should match NAT EIP") t.Logf("Confirmed: workload egresses via NAT EIP %s", natEIP) - }) + }) { + return + } // ── Phase 2: NAT scale-down ───────────────────────────────────────── // Terminate the workload and let EventBridge drive the full // scale-down flow: stop NAT, then detach/release EIP. - t.Run("NATScaleDown", func(t *testing.T) { + if !runPhase("NATScaleDown", func(t *testing.T) { require.NotEmpty(t, activeWorkloadID, "Phase 1 must set activeWorkloadID") // Terminate the workload instance. EventBridge fires shutting-down @@ -304,12 +313,14 @@ func TestNatZero(t *testing.T) { }) record("Wait for EIP released", time.Since(eipStart)) t.Log("NAT stopped and EIP released") - }) + }) { + return + } // ── Phase 3: NAT restart from stopped state ───────────────────────── // Launch a new workload and let EventBridge trigger the restart. 
- t.Run("NATRestart", func(t *testing.T) { + if !runPhase("NATRestart", func(t *testing.T) { t.Log("Launching new workload to trigger NAT restart...") wlStart := time.Now() newWorkloadID := launchWorkload(t, ec2Client, privateSubnet, amiID, runID, profileName, queueURL) @@ -347,11 +358,13 @@ func TestNatZero(t *testing.T) { } else { t.Logf("Workload egressed via NAT auto-assigned IP %s (EIP %s attached after; expected during restart)", msg.EgressIP, natEIP) } - }) + }) { + return + } // ── Phase 4: NAT replacement on AMI update ───────────────────────── - t.Run("NATAMIUpgrade", func(t *testing.T) { + if !runPhase("NATAMIUpgrade", func(t *testing.T) { if updatedNatAMI == "" { t.Skip("NAT_ZERO_TEST_UPDATED_NAT_AMI_ID not set") } @@ -406,11 +419,13 @@ func TestNatZero(t *testing.T) { msg := waitForEgress(t, sqsClient, queueURL, 4*time.Minute) record("Wait for workload egress IP (AMI update)", time.Since(egressStart)) require.Equal(t, replacementEIP, msg.EgressIP, "workload egress IP should match replacement NAT EIP") - }) + }) { + return + } // ── Phase 5: Cleanup action ───────────────────────────────────────── - t.Run("CleanupAction", func(t *testing.T) { + runPhase("CleanupAction", func(t *testing.T) { // Terminate all test workloads before cleanup to match production // destroy ordering where Terraform deletes the EventBridge target // (stopping new events) before invoking the cleanup Lambda. From 46a22f8467691ced081c8d19743be56a16d548cb Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 11:55:07 +1000 Subject: [PATCH 26/27] fix: restore launch template version permission --- iam.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/iam.tf b/iam.tf index 848362f..f9274a2 100644 --- a/iam.tf +++ b/iam.tf @@ -48,6 +48,8 @@ resource "aws_iam_role_policy" "lambda_iam_policy" { Action = [ "ec2:DescribeInstances", "ec2:DescribeLaunchTemplates", + # EC2 resolves launch template versions during RunInstances authorization. 
+ "ec2:DescribeLaunchTemplateVersions", "ec2:DescribeAddresses", ] Resource = "*" From 8af7b80310ea73a1a4f2d3c19ecac94b327794be Mon Sep 17 00:00:00 2001 From: Leonard O'Sullivan Date: Mon, 9 Mar 2026 12:20:37 +1000 Subject: [PATCH 27/27] fix: test branch lambda artifacts in integration --- tests/integration/fixture/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/fixture/main.tf b/tests/integration/fixture/main.tf index d8f208a..473c9c7 100644 --- a/tests/integration/fixture/main.tf +++ b/tests/integration/fixture/main.tf @@ -102,7 +102,7 @@ module "nat_zero" { # - lambda_binary_path lets branch tests exercise unreleased Lambda code. # Normal module consumers should omit both and use the published defaults. ami_id = var.nat_ami_id - lambda_binary_path = fileexists("${path.module}/../../.build/lambda.zip") ? abspath("${path.module}/../../.build/lambda.zip") : null + lambda_binary_path = fileexists("${path.module}/../../../.build/lambda.zip") ? abspath("${path.module}/../../../.build/lambda.zip") : null } output "vpc_id" {