From fbbedbecb2fcde9ee25af9c6e262465ec64905b1 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 01:52:22 -0700
Subject: [PATCH 01/14] Update

[ghstack-poisoned]
---
 .../docker/test-buildkit-scale/Dockerfile     |  6 ++
 .../scripts/python/phases.py                  | 20 ++++---
 .../scripts/python/test_run.py                | 10 ++--
 .../workflows/build-image-scale.yaml          | 59 +++++++++++++++++++
 .../workflows/integration-test.yaml.tpl       | 14 +++++
 5 files changed, 96 insertions(+), 13 deletions(-)
 create mode 100644 osdc/integration-tests/docker/test-buildkit-scale/Dockerfile
 create mode 100644 osdc/integration-tests/workflows/build-image-scale.yaml

diff --git a/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile b/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile
new file mode 100644
index 00000000..e317eb61
--- /dev/null
+++ b/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile
@@ -0,0 +1,6 @@
+# Holds one BuildKit slot (~10 min) so a burst of parallel builds exceeds the
+# warm baseline and forces KEDA scale-up. CACHEBUST differs per build so each
+# actually runs (no layer reuse).
+FROM alpine:3.21
+ARG CACHEBUST
+RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600
diff --git a/osdc/integration-tests/scripts/python/phases.py b/osdc/integration-tests/scripts/python/phases.py
index 27ddebf6..17654000 100644
--- a/osdc/integration-tests/scripts/python/phases.py
+++ b/osdc/integration-tests/scripts/python/phases.py
@@ -276,15 +276,17 @@ def prepare_pr(
     # Write integration test workflow
     (workflows_dir / "integration-test.yaml").write_text(workflow_content)
 
-    # Copy build-image reusable workflow
-    build_wf_src = upstream_dir / "integration-tests" / "workflows" / "build-image.yaml"
-    (workflows_dir / "build-image.yaml").write_text(build_wf_src.read_text())
-
-    # Copy test Dockerfile
-    docker_dir = canary_path / "docker" / "test-buildkit"
-    docker_dir.mkdir(parents=True, exist_ok=True)
-    dockerfile_src = upstream_dir / "integration-tests" / "docker" / "test-buildkit" / "Dockerfile"
-    (docker_dir / "Dockerfile").write_text(dockerfile_src.read_text())
+    # Copy reusable BuildKit workflows
+    wf_root = upstream_dir / "integration-tests" / "workflows"
+    for wf in ("build-image.yaml", "build-image-scale.yaml"):
+        (workflows_dir / wf).write_text((wf_root / wf).read_text())
+
+    # Copy test Dockerfiles (connectivity + autoscaling scale test)
+    docker_root = upstream_dir / "integration-tests" / "docker"
+    for name in ("test-buildkit", "test-buildkit-scale"):
+        dst = canary_path / "docker" / name
+        dst.mkdir(parents=True, exist_ok=True)
+        (dst / "Dockerfile").write_text((docker_root / name / "Dockerfile").read_text())
 
     # Commit
     run_cmd(["git", "add", "-A"], cwd=canary_path)
diff --git a/osdc/integration-tests/scripts/python/test_run.py b/osdc/integration-tests/scripts/python/test_run.py
index ff3a7bcb..dc4d5188 100644
--- a/osdc/integration-tests/scripts/python/test_run.py
+++ b/osdc/integration-tests/scripts/python/test_run.py
@@ -112,11 +112,13 @@ def workflow_template(tmp_path):
     )
     (wf_dir / "integration-test.yaml.tpl").write_text(template)
 
-    # Also create build-image.yaml and Dockerfile for prepare_pr
+    # Also create reusable workflows and Dockerfiles for prepare_pr
     (wf_dir / "build-image.yaml").write_text("name: build-image\n")
-    docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit"
-    docker_dir.mkdir(parents=True)
-    (docker_dir / "Dockerfile").write_text("FROM alpine\n")
+    (wf_dir / "build-image-scale.yaml").write_text("name: build-image-scale\n")
+    for name in ("test-buildkit", "test-buildkit-scale"):
+        docker_dir = upstream / "integration-tests" / "docker" / name
+        docker_dir.mkdir(parents=True)
+        (docker_dir / "Dockerfile").write_text("FROM alpine\n")
 
     return upstream
 
diff --git a/osdc/integration-tests/workflows/build-image-scale.yaml b/osdc/integration-tests/workflows/build-image-scale.yaml
new file mode 100644
index 00000000..b485dc5a
--- /dev/null
+++ b/osdc/integration-tests/workflows/build-image-scale.yaml
@@ -0,0 +1,59 @@
+# Reusable workflow: BuildKit autoscaling scale test.
+# Launches a burst of 8 parallel builds against one arch's remote BuildKit, each
+# holding a slot (server maxconn=1) for ~10 min via a sleep. The warm baseline
+# (amd64_min / arm64_min) is below the burst, so the builds finish within
+# timeout-minutes only if KEDA scales the pool up. Without scale-up they
+# serialize through the baseline pods and the back of the queue times out — i.e.
+# this job FAILS when autoscaling does not happen.
+name: BuildKit Scale Test
+
+on:
+  workflow_call:
+    inputs:
+      arch:
+        description: "Target architecture (amd64 or arm64)"
+        required: true
+        type: string
+      runner_label:
+        description: "Runner label to use (includes cluster prefix)"
+        required: false
+        type: string
+        default: "l-x86iavx512-2-4"
+
+jobs:
+  scale:
+    # Runs on x86 — BuildKit is a *remote* builder; arch selects the endpoint.
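+    # timeout-minutes is the gate: scaled-up ~18 min; serialized through a 2-pod baseline ~43 min (NOTE(review): a 4-pod baseline needs only 2 rounds, ~20 min + setup — confirm arm64 still exceeds the limit).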
+    runs-on: ${{ inputs.runner_label }}
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        replica: [1, 2, 3, 4, 5, 6, 7, 8]
+    container:
+      image: ghcr.io/actions/actions-runner:latest
+    steps:
+      - name: Install buildctl
+        run: |
+          BUILDKIT_VERSION="v0.29.0"
+          mkdir -p "$HOME/.local/bin"
+          curl -sSL "https://github.com/moby/buildkit/releases/download/${BUILDKIT_VERSION}/buildkit-${BUILDKIT_VERSION}.linux-amd64.tar.gz" \
+            | tar xz --strip-components=1 -C "$HOME/.local/bin" bin/buildctl
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+          "$HOME/.local/bin/buildctl" --version
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Occupy a BuildKit slot (~10 min) to drive autoscaling
+        run: |
+          ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
+          echo "Build ${{ matrix.replica }} -> $ENDPOINT"
+          buildctl --addr "$ENDPOINT" build \
+            --frontend dockerfile.v0 \
+            --local context=docker/test-buildkit-scale \
+            --local dockerfile=docker/test-buildkit-scale \
+            --opt build-arg:CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }} \
+            --no-cache \
+            --output type=cacheonly
+          echo "PASS: build ${{ matrix.replica }} finished within timeout"
diff --git a/osdc/integration-tests/workflows/integration-test.yaml.tpl b/osdc/integration-tests/workflows/integration-test.yaml.tpl
index c9563480..7d4fb3aa 100644
--- a/osdc/integration-tests/workflows/integration-test.yaml.tpl
+++ b/osdc/integration-tests/workflows/integration-test.yaml.tpl
@@ -1514,6 +1514,20 @@ jobs:
       arch: arm64
       runner_label: {{PREFIX}}l-x86iamx-8-32
 
+  # ── BuildKit Autoscaling Scale Test ───────────────────────────────────
+  # Bursts 8 parallel builds per arch; fails if KEDA does not scale the pool up.
+  buildkit-scale-amd64:
+    uses: ./.github/workflows/build-image-scale.yaml
+    with:
+      arch: amd64
+      runner_label: {{PREFIX}}l-x86iamx-8-32
+
+  buildkit-scale-arm64:
+    uses: ./.github/workflows/build-image-scale.yaml
+    with:
+      arch: arm64
+      runner_label: {{PREFIX}}l-x86iamx-8-32
+
   # ── Harbor Cache Test ─────────────────────────────────────────────────
   test-harbor:
     runs-on: {{PREFIX}}l-x86iamx-8-32

From 4f58d648123935f125e406cb5549e247c26811af Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 01:52:22 -0700
Subject: [PATCH 02/14] Update (base update)

[ghstack-poisoned]
---
 osdc/clusters.yaml                            |  21 ++-
 osdc/modules/buildkit/README.md               |  32 ++++
 osdc/modules/buildkit/deploy.sh               |  42 ++++-
 .../kubernetes/base/drain-configmap.yaml      |  21 +++
 .../buildkit/kubernetes/base/haproxy.yaml     |  10 +-
 .../kubernetes/base/kustomization.yaml        |   2 +
 .../kubernetes/base/poddisruptionbudget.yaml  |  25 +++
 .../scripts/python/generate_buildkit.py       | 157 ++++++++++++++++--
 .../scripts/python/test_generate_buildkit.py  | 118 +++++++++++++
 osdc/modules/keda/deploy.sh                   |  34 ++++
 10 files changed, 430 insertions(+), 32 deletions(-)
 create mode 100644 osdc/modules/buildkit/README.md
 create mode 100644 osdc/modules/buildkit/kubernetes/base/drain-configmap.yaml
 create mode 100644 osdc/modules/buildkit/kubernetes/base/poddisruptionbudget.yaml
 create mode 100755 osdc/modules/keda/deploy.sh

diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml
index 44aa9a75..f6e15a08 100644
--- a/osdc/clusters.yaml
+++ b/osdc/clusters.yaml
@@ -83,6 +83,10 @@ defaults:
     cpu_disruption_budget: "20%"
   buildkit:
     replicas_per_arch: 12
+    autoscaling:
+      enabled: false
+  keda:
+    chart_version: "2.16.1"
   monitoring:
     grafana_cloud_url: "https://prometheus-prod-36-prod-us-west-0.grafana.net/api/prom/push"
     grafana_cloud_read_url: "https://prometheus-prod-36-prod-us-west-0.grafana.net/api/prom"
@@ -162,8 +166,14 @@ clusters:
       github_secret_name: pytorch-arc-staging
       runner_name_prefix: "c-mt-"
     buildkit:
-      replicas_per_arch: 2
       arm64_instance_type: m7gd.16xlarge
+      arm64_pods_per_node: 4
+      autoscaling:
+        enabled: true
+        amd64_min: 2  # 1x m6id.24xlarge (2 pods/node)
+        amd64_max: 8
+        arm64_min: 4  # 1x m7gd.16xlarge (4 pods/node)
+        arm64_max: 8
     pypi_cache:
       replicas: 1
     modules:
@@ -171,6 +181,7 @@ clusters:
       - arc
       - nodepools
       - arc-runners
+      - keda
       - buildkit
       - pypi-cache
       - cache-enforcer
@@ -283,11 +294,15 @@ clusters:
       min_node_age_seconds: 900
     buildkit:
       amd64_instance_type: m6id.24xlarge
-      amd64_replicas: 32
       amd64_pods_per_node: 2
       arm64_instance_type: m7gd.16xlarge
-      arm64_replicas: 8
       arm64_pods_per_node: 4
+      autoscaling:
+        enabled: true
+        amd64_min: 2   # 1x m6id.24xlarge (2 pods/node)
+        amd64_max: 128  # 14d peak 105; headroom to spare above it
+        arm64_min: 4   # 1x m7gd.16xlarge (4 pods/node)
+        arm64_max: 16  # 14d peak 8, likely capped by fixed pool; headroom
     arc-runners:
       github_config_url: "https://github.com/pytorch"
       github_secret_name: pytorch-arc-cbr-production
diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md
new file mode 100644
index 00000000..e17fafdf
--- /dev/null
+++ b/osdc/modules/buildkit/README.md
@@ -0,0 +1,32 @@
+# BuildKit module
+
+Remote BuildKit build service: per-arch `buildkitd` Deployments behind an HAProxy
+LB, on dedicated Karpenter NodePools. Clients build with
+`buildctl --addr tcp://buildkitd-<arch>.buildkit:1234`.
+
+Sizing is per-arch in `clusters.yaml` (`buildkit.{amd64,arm64}_{replicas,pods_per_node}`,
+`*_instance_type`); pod CPU/mem is computed by `scripts/python/generate_buildkit.py`.
+
+## Autoscaling (optional, `buildkit.autoscaling.enabled`)
+
+Absorb bursts of concurrent builds without overloading existing pods, and scale
+back to a small warm baseline when idle.
+
+- **One build per pod** — HAProxy `server maxconn 1` (matches buildkitd
+  `max-parallelism = 1`). Excess builds **queue** in HAProxy (`timeout queue`)
+  instead of stacking on a busy pod; as new pods register (DNS), queued builds
+  flow onto them, so scaled-up pods never sit idle.
+- **In-cluster scale signal** — KEDA `ScaledObject` per arch, `metrics-api`
+  scraping the LB's own metrics (`haproxy_backend_current_sessions`) — no external
+  metrics backend.
+- **Warm baseline** — `amd64_min` / `arm64_min` keep ≥1 node per arch up so the
+  common case gets a free warm pod immediately. `*_max` caps the burst; NodePool
+  limits are sized to `*_max`.
+- **Safe scale-down** — `preStop` drain (waits until the pod's `:1234` is idle),
+  a long `terminationGracePeriodSeconds`, and a PDB, so a build is not killed
+  mid-flight (up to the grace period).
+
+Build clients should retry the connect so a build can wait for a pod from a cold
+or queued pool.
+
+Requires the `keda` module deployed before `buildkit` (provides the CRDs).
diff --git a/osdc/modules/buildkit/deploy.sh b/osdc/modules/buildkit/deploy.sh
index c7be6d39..ece804ad 100755
--- a/osdc/modules/buildkit/deploy.sh
+++ b/osdc/modules/buildkit/deploy.sh
@@ -34,22 +34,38 @@ AMD64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_replicas "$REPLICAS")
 ARM64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_replicas "$REPLICAS")
 AMD64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_pods_per_node "$PODS_PER_NODE")
 ARM64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_pods_per_node "$PODS_PER_NODE")
+AUTOSCALING=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.enabled false)
 
 GENERATED_DIR="$MODULE_DIR/generated"
 
 # --- Generate manifests ---
 
 echo "Generating BuildKit manifests..."
-uv run "$MODULE_DIR/scripts/python/generate_buildkit.py" \
-  --arm64-instance-type "$ARM64_INSTANCE" \
-  --amd64-instance-type "$AMD64_INSTANCE" \
-  --replicas "$REPLICAS" \
-  --pods-per-node "$PODS_PER_NODE" \
-  --amd64-replicas "$AMD64_REPLICAS" \
-  --arm64-replicas "$ARM64_REPLICAS" \
-  --amd64-pods-per-node "$AMD64_PODS_PER_NODE" \
-  --arm64-pods-per-node "$ARM64_PODS_PER_NODE" \
+GEN_ARGS=(
+  --arm64-instance-type "$ARM64_INSTANCE"
+  --amd64-instance-type "$AMD64_INSTANCE"
+  --replicas "$REPLICAS"
+  --pods-per-node "$PODS_PER_NODE"
+  --amd64-replicas "$AMD64_REPLICAS"
+  --arm64-replicas "$ARM64_REPLICAS"
+  --amd64-pods-per-node "$AMD64_PODS_PER_NODE"
+  --arm64-pods-per-node "$ARM64_PODS_PER_NODE"
   --output-dir "$GENERATED_DIR"
+)
+if [[ "${AUTOSCALING,,}" == "true" ]]; then
+  AMD64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_min 2)
+  AMD64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_max 8)
+  ARM64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_min 4)
+  ARM64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_max 8)
+  GEN_ARGS+=(
+    --autoscaling
+    --amd64-min "$AMD64_MIN"
+    --amd64-max "$AMD64_MAX"
+    --arm64-min "$ARM64_MIN"
+    --arm64-max "$ARM64_MAX"
+  )
+fi
+uv run "$MODULE_DIR/scripts/python/generate_buildkit.py" "${GEN_ARGS[@]}"
 
 # --- Apply NodePools (with cluster name substitution) ---
 
@@ -93,5 +109,13 @@ else
   kubectl rollout status deployment/buildkitd-amd64 -n buildkit --timeout=15m
 fi
 
+# --- KEDA autoscaling (optional) ---
+# Scales on the in-cluster buildkit LB metrics; no external metrics backend.
+
+if [[ "${AUTOSCALING,,}" == "true" ]]; then
+  echo "Applying KEDA autoscaling manifests..."
+  kubectl_apply_if_changed -f "$GENERATED_DIR/autoscaling.yaml"
+fi
+
 echo "BuildKit deployed."
 kubectl get pods -n buildkit -o wide
diff --git a/osdc/modules/buildkit/kubernetes/base/drain-configmap.yaml b/osdc/modules/buildkit/kubernetes/base/drain-configmap.yaml
new file mode 100644
index 00000000..89395ae9
--- /dev/null
+++ b/osdc/modules/buildkit/kubernetes/base/drain-configmap.yaml
@@ -0,0 +1,21 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: buildkitd-drain
+  namespace: buildkit
+data:
+  # preStop drain: block termination until no in-flight build remains. A build
+  # keeps an ESTABLISHED inbound connection on :1234 for its whole duration;
+  # require two consecutive idle polls (15 s apart) so a momentary gap between
+  # connections isn't mistaken for "done". terminationGracePeriodSeconds caps the total wait.
+  drain.sh: |
+    #!/bin/sh
+    idle=0
+    while [ "$idle" -lt 2 ]; do
+      if netstat -tn 2>/dev/null | awk '$NF=="ESTABLISHED" && $4 ~ /:1234$/{f=1} END{exit !f}'; then
+        idle=0
+      else
+        idle=$((idle + 1))
+      fi
+      sleep 15
+    done
diff --git a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
index 3bfd4eb2..a52cece7 100644
--- a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
+++ b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
@@ -24,6 +24,10 @@ data:
       timeout connect 5s
       timeout client 120m
       timeout server 120m
+      # Queue a build (server maxconn=1, below) while KEDA/Karpenter add pods,
+      # instead of stacking it on a busy pod. The runner also retries the
+      # connect, so this is an upper bound, not the only safety net.
+      timeout queue 10m
       log global
       option tcplog
 
@@ -71,15 +75,15 @@ data:
     # resolution warnings.
     backend bk_arm64
       balance leastconn
-      server-template arm 10 buildkitd-arm64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
+      server-template arm 10 buildkitd-arm64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1
 
     backend bk_amd64
       balance leastconn
-      server-template amd 10 buildkitd-amd64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
+      server-template amd 10 buildkitd-amd64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1
 
     backend bk_all
       balance leastconn
-      server-template all 20 buildkitd-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
+      server-template all 20 buildkitd-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1
 
 ---
 apiVersion: apps/v1
diff --git a/osdc/modules/buildkit/kubernetes/base/kustomization.yaml b/osdc/modules/buildkit/kubernetes/base/kustomization.yaml
index 9cbffb38..32572408 100644
--- a/osdc/modules/buildkit/kubernetes/base/kustomization.yaml
+++ b/osdc/modules/buildkit/kubernetes/base/kustomization.yaml
@@ -5,6 +5,8 @@ kind: Kustomization
 resources:
   - namespace.yaml
   - configmap.yaml
+  - drain-configmap.yaml
   - haproxy.yaml
   - service.yaml
   - networkpolicy.yaml
+  - poddisruptionbudget.yaml
diff --git a/osdc/modules/buildkit/kubernetes/base/poddisruptionbudget.yaml b/osdc/modules/buildkit/kubernetes/base/poddisruptionbudget.yaml
new file mode 100644
index 00000000..b717bb64
--- /dev/null
+++ b/osdc/modules/buildkit/kubernetes/base/poddisruptionbudget.yaml
@@ -0,0 +1,25 @@
+# Cap voluntary disruptions (node consolidation, drains) to one builder per arch
+# at a time so evictions go through the preStop drain instead of killing builds.
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: buildkitd-arm64
+  namespace: buildkit
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      app: buildkitd
+      arch: arm64
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: buildkitd-amd64
+  namespace: buildkit
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      app: buildkitd
+      arch: amd64
diff --git a/osdc/modules/buildkit/scripts/python/generate_buildkit.py b/osdc/modules/buildkit/scripts/python/generate_buildkit.py
index e7468418..830b8998 100644
--- a/osdc/modules/buildkit/scripts/python/generate_buildkit.py
+++ b/osdc/modules/buildkit/scripts/python/generate_buildkit.py
@@ -103,6 +103,7 @@ def generate_deployment_yaml(
     arm64_replicas: int | None = None,
     amd64_pods_per_node: int | None = None,
     arm64_pods_per_node: int | None = None,
+    autoscaling: bool = False,
 ) -> str:
     """Generate the combined Deployment YAML for both architectures.
 
@@ -118,6 +119,37 @@ def generate_deployment_yaml(
     arm64_res = compute_pod_resources(arm64_instance, arm64_pods_per_node)
     amd64_res = compute_pod_resources(amd64_instance, amd64_pods_per_node)
 
+    # When KEDA owns the replica count, omit `replicas` and add a preStop drain
+    # that holds the pod open until its in-flight build finishes. `replicas_line`
+    # is computed per-arch inside _deployment_block (below).
+    grace_line = "      terminationGracePeriodSeconds: 8100\n" if autoscaling else ""
+    lifecycle_block = (
+        """
+          lifecycle:
+            preStop:
+              exec:
+                command: ["/bin/sh", "/opt/drain/drain.sh"]"""
+        if autoscaling
+        else ""
+    )
+    drain_mount = (
+        """
+            - name: drain
+              mountPath: /opt/drain
+              readOnly: true"""
+        if autoscaling
+        else ""
+    )
+    drain_volume = (
+        """
+        - name: drain
+          configMap:
+            name: buildkitd-drain
+            defaultMode: 0555"""
+        if autoscaling
+        else ""
+    )
+
     log_info(
         f"arm64 ({arm64_instance}): {arm64_res['cpu']} vCPU, {arm64_res['memory_gi']}Gi per pod "
         f"(allocatable: {arm64_res['allocatable_cpu_m']}m CPU, {arm64_res['allocatable_mem_mi']}Mi mem)"
@@ -128,6 +160,7 @@ def generate_deployment_yaml(
     )
 
     def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_node):
+        replicas_line = "" if autoscaling else f"  replicas: {replicas}\n"
         return f"""apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -139,8 +172,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
     app.kubernetes.io/name: buildkitd
     app.kubernetes.io/component: build-service
 spec:
-  replicas: {replicas}
-  strategy:
+{replicas_line}  strategy:
     type: RollingUpdate
     rollingUpdate:
       maxSurge: 0
@@ -155,7 +187,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
         app: buildkitd
         arch: {arch}
     spec:
-      nodeSelector:
+{grace_line}      nodeSelector:
         workload-type: buildkit
         instance-type: "{instance_type}"
 
@@ -208,7 +240,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
                   fieldPath: metadata.name
 
           securityContext:
-            privileged: true
+            privileged: true{lifecycle_block}
 
           readinessProbe:
             exec:
@@ -237,7 +269,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
               subPathExpr: $(POD_NAME)
             - name: git-cache
               mountPath: /opt/git-cache
-              readOnly: true
+              readOnly: true{drain_mount}
 
       volumes:
         - name: config
@@ -252,7 +284,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
         - name: git-cache
           hostPath:
             path: /mnt/k8s-disks/0/git-cache
-            type: DirectoryOrCreate"""
+            type: DirectoryOrCreate{drain_volume}"""
 
     arm64_block = _deployment_block(
         "arm64", arm64_instance, arm64_res["cpu"], arm64_res["memory_gi"], arm64_replicas, arm64_pods_per_node
@@ -453,6 +485,63 @@ def _nodepool_block(arch, instance_type, cpu_limit, memory_limit_gi):
     return arm64_block + "\n\n---\n" + amd64_block + "\n"
 
 
+def generate_autoscaling_yaml(
+    amd64_min: int,
+    amd64_max: int,
+    arm64_min: int,
+    arm64_max: int,
+) -> str:
+    """Generate per-arch KEDA ScaledObjects.
+
+    Each arch scales on its HAProxy backend's active build count
+    (haproxy_backend_current_sessions), scraped in-cluster from the buildkit LB
+    metrics endpoint — no external metrics backend. With server maxconn=1, the LB
+    queues bursts while KEDA/Karpenter bring up pods; minReplicaCount keeps a warm
+    baseline so the common case has a free pod immediately.
+    """
+
+    metrics_url = "http://buildkitd-lb-metrics.buildkit.svc.cluster.local:9404/metrics"
+
+    def _scaledobject(arch, backend, min_replicas, max_replicas):
+        return f"""apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: buildkitd-{arch}
+  namespace: buildkit
+spec:
+  scaleTargetRef:
+    name: buildkitd-{arch}
+  minReplicaCount: {min_replicas}
+  maxReplicaCount: {max_replicas}
+  cooldownPeriod: 600
+  advanced:
+    horizontalPodAutoscalerConfig:
+      behavior:
+        scaleDown:
+          stabilizationWindowSeconds: 600
+          policies:
+            - type: Pods
+              value: 1
+              periodSeconds: 120
+  triggers:
+    - type: metrics-api
+      metadata:
+        url: "{metrics_url}"
+        format: "prometheus"
+        valueLocation: 'haproxy_backend_current_sessions{{proxy="{backend}"}}'
+        targetValue: "1"
+"""
+
+    header = "# KEDA autoscaling — auto-generated by generate_buildkit.py. Do not edit by hand.\n"
+    return (
+        header
+        + "\n"
+        + _scaledobject("amd64", "bk_amd64", amd64_min, amd64_max)
+        + "\n---\n"
+        + _scaledobject("arm64", "bk_arm64", arm64_min, arm64_max)
+    )
+
+
 def main():
     parser = argparse.ArgumentParser(description="Generate BuildKit Deployment and NodePool YAMLs")
     parser.add_argument("--arm64-instance-type", required=True, help="ARM64 instance type (e.g., m8gd.24xlarge)")
@@ -464,8 +553,17 @@ def main():
     parser.add_argument("--amd64-pods-per-node", type=int, default=None, help="Override amd64 pods per node")
     parser.add_argument("--arm64-pods-per-node", type=int, default=None, help="Override arm64 pods per node")
     parser.add_argument("--output-dir", required=True, help="Output directory for generated YAMLs")
+    parser.add_argument("--autoscaling", action="store_true", help="Generate KEDA autoscaling manifests")
+    parser.add_argument("--amd64-min", type=int, default=0, help="KEDA minReplicaCount for amd64")
+    parser.add_argument("--amd64-max", type=int, default=0, help="KEDA maxReplicaCount for amd64")
+    parser.add_argument("--arm64-min", type=int, default=0, help="KEDA minReplicaCount for arm64")
+    parser.add_argument("--arm64-max", type=int, default=0, help="KEDA maxReplicaCount for arm64")
     args = parser.parse_args()
 
+    if args.autoscaling and not (args.amd64_max and args.arm64_max):
+        log_error("--autoscaling requires --amd64-max and --arm64-max")
+        return 1
+
     # Validate instance types
     for it in [args.arm64_instance_type, args.amd64_instance_type]:
         if it not in INSTANCE_SPECS:
@@ -498,26 +596,51 @@ def main():
         arm64_replicas=args.arm64_replicas,
         amd64_pods_per_node=args.amd64_pods_per_node,
         arm64_pods_per_node=args.arm64_pods_per_node,
+        autoscaling=args.autoscaling,
     )
     deployment_path = output_dir / "deployment.yaml"
     deployment_path.write_text(deployment_yaml)
     log_info(f"Wrote {deployment_path}")
 
-    # Generate nodepools
-    nodepools_yaml = generate_nodepools_yaml(
-        args.arm64_instance_type,
-        args.amd64_instance_type,
-        args.replicas,
-        args.pods_per_node,
-        amd64_replicas=args.amd64_replicas,
-        arm64_replicas=args.arm64_replicas,
-        amd64_pods_per_node=args.amd64_pods_per_node,
-        arm64_pods_per_node=args.arm64_pods_per_node,
-    )
+    # Size NodePool limits for the peak: the per-arch autoscaling ceiling
+    # (amd64_max / arm64_max) when enabled, otherwise the per-arch replica counts.
+    if args.autoscaling:
+        nodepools_yaml = generate_nodepools_yaml(
+            args.arm64_instance_type,
+            args.amd64_instance_type,
+            args.replicas,
+            args.pods_per_node,
+            amd64_replicas=args.amd64_max,
+            arm64_replicas=args.arm64_max,
+            amd64_pods_per_node=args.amd64_pods_per_node,
+            arm64_pods_per_node=args.arm64_pods_per_node,
+        )
+    else:
+        nodepools_yaml = generate_nodepools_yaml(
+            args.arm64_instance_type,
+            args.amd64_instance_type,
+            args.replicas,
+            args.pods_per_node,
+            amd64_replicas=args.amd64_replicas,
+            arm64_replicas=args.arm64_replicas,
+            amd64_pods_per_node=args.amd64_pods_per_node,
+            arm64_pods_per_node=args.arm64_pods_per_node,
+        )
     nodepools_path = output_dir / "nodepools.yaml"
     nodepools_path.write_text(nodepools_yaml)
     log_info(f"Wrote {nodepools_path}")
 
+    if args.autoscaling:
+        autoscaling_yaml = generate_autoscaling_yaml(
+            args.amd64_min,
+            args.amd64_max,
+            args.arm64_min,
+            args.arm64_max,
+        )
+        autoscaling_path = output_dir / "autoscaling.yaml"
+        autoscaling_path.write_text(autoscaling_yaml)
+        log_info(f"Wrote {autoscaling_path}")
+
     return 0
 
 
diff --git a/osdc/modules/buildkit/scripts/python/test_generate_buildkit.py b/osdc/modules/buildkit/scripts/python/test_generate_buildkit.py
index 387a283d..c373c9c0 100644
--- a/osdc/modules/buildkit/scripts/python/test_generate_buildkit.py
+++ b/osdc/modules/buildkit/scripts/python/test_generate_buildkit.py
@@ -12,6 +12,7 @@
     DAEMONSET_OVERHEAD_MEM_MI,
     MARGIN,
     compute_pod_resources,
+    generate_autoscaling_yaml,
     generate_deployment_yaml,
     generate_nodepools_yaml,
 )
@@ -341,6 +342,65 @@ def test_nodepool_limits_scale_per_arch(self):
         assert np["buildkit-arm64"]["spec"]["limits"]["cpu"] == "256"
 
 
+# ============================================================================
+# autoscaling
+# ============================================================================
+
+
+class TestAutoscalingDeployment:
+    """When autoscaling=True the Deployment yields control of replicas to KEDA
+    and gains a preStop drain so scale-down never kills an in-flight build."""
+
+    def test_omits_replicas(self):
+        output = generate_deployment_yaml("m8gd.24xlarge", "m6id.24xlarge", 4, 2, autoscaling=True)
+        for d in parse_all_yaml(output):
+            assert "replicas" not in d["spec"]
+
+    def test_keeps_replicas_when_disabled(self):
+        output = generate_deployment_yaml("m8gd.24xlarge", "m6id.24xlarge", 4, 2)
+        for d in parse_all_yaml(output):
+            assert d["spec"]["replicas"] == 4
+
+    def test_drain_prestop_and_grace(self):
+        output = generate_deployment_yaml("m8gd.24xlarge", "m6id.24xlarge", 4, 2, autoscaling=True)
+        for d in parse_all_yaml(output):
+            spec = d["spec"]["template"]["spec"]
+            assert spec["terminationGracePeriodSeconds"] == 8100
+            container = spec["containers"][0]
+            assert container["lifecycle"]["preStop"]["exec"]["command"] == ["/bin/sh", "/opt/drain/drain.sh"]
+            assert "drain" in {vm["name"] for vm in container["volumeMounts"]}
+            assert "drain" in {v["name"] for v in spec["volumes"]}
+
+
+class TestGenerateAutoscalingYaml:
+    """Tests for generate_autoscaling_yaml — in-cluster KEDA ScaledObjects."""
+
+    def _docs(self):
+        output = generate_autoscaling_yaml(2, 8, 4, 8)
+        return parse_all_yaml(output)
+
+    def test_only_scaledobjects_no_external_auth(self):
+        # In-cluster signal → no TriggerAuthentication / Grafana secret needed.
+        kinds = sorted(d["kind"] for d in self._docs())
+        assert kinds == ["ScaledObject", "ScaledObject"]
+
+    def test_per_arch_min_max(self):
+        scaled = {d["metadata"]["name"]: d for d in self._docs() if d["kind"] == "ScaledObject"}
+        assert set(scaled) == {"buildkitd-amd64", "buildkitd-arm64"}
+        assert scaled["buildkitd-amd64"]["spec"]["minReplicaCount"] == 2
+        assert scaled["buildkitd-amd64"]["spec"]["maxReplicaCount"] == 8
+        assert scaled["buildkitd-arm64"]["spec"]["minReplicaCount"] == 4
+        assert scaled["buildkitd-arm64"]["spec"]["maxReplicaCount"] == 8
+
+    def test_in_cluster_metrics_api_trigger(self):
+        scaled = {d["metadata"]["name"]: d for d in self._docs() if d["kind"] == "ScaledObject"}
+        for name, backend in [("buildkitd-amd64", "bk_amd64"), ("buildkitd-arm64", "bk_arm64")]:
+            trig = scaled[name]["spec"]["triggers"][0]
+            assert trig["type"] == "metrics-api"
+            assert "buildkitd-lb-metrics.buildkit" in trig["metadata"]["url"]
+            assert f'proxy="{backend}"' in trig["metadata"]["valueLocation"]
+
+
 # ============================================================================
 # generate_nodepools_yaml
 # ============================================================================
@@ -520,6 +580,64 @@ def test_deployment_yaml_parseable(self, tmp_path):
         docs = parse_all_yaml(deployment_text)
         assert len(docs) == 2
 
+    def test_autoscaling_writes_manifests(self, tmp_path):
+        output_dir = tmp_path / "output"
+
+        import generate_buildkit
+
+        test_args = [
+            "generate_buildkit.py",
+            "--arm64-instance-type",
+            "m8gd.24xlarge",
+            "--amd64-instance-type",
+            "m6id.24xlarge",
+            "--replicas",
+            "1",
+            "--pods-per-node",
+            "2",
+            "--output-dir",
+            str(output_dir),
+            "--autoscaling",
+            "--amd64-min",
+            "2",
+            "--amd64-max",
+            "8",
+            "--arm64-min",
+            "4",
+            "--arm64-max",
+            "8",
+        ]
+        with patch.object(sys, "argv", test_args):
+            result = generate_buildkit.main()
+
+        assert result == 0
+        autoscaling = parse_all_yaml((output_dir / "autoscaling.yaml").read_text())
+        assert sorted(d["kind"] for d in autoscaling) == ["ScaledObject", "ScaledObject"]
+
+    def test_autoscaling_requires_params(self, tmp_path):
+        output_dir = tmp_path / "output"
+
+        import generate_buildkit
+
+        test_args = [
+            "generate_buildkit.py",
+            "--arm64-instance-type",
+            "m8gd.24xlarge",
+            "--amd64-instance-type",
+            "m6id.24xlarge",
+            "--replicas",
+            "1",
+            "--pods-per-node",
+            "2",
+            "--output-dir",
+            str(output_dir),
+            "--autoscaling",
+        ]
+        with patch.object(sys, "argv", test_args):
+            result = generate_buildkit.main()
+
+        assert result == 1
+
     def test_unknown_instance_type_fails(self, tmp_path):
         output_dir = tmp_path / "output"
         test_args = [
diff --git a/osdc/modules/keda/deploy.sh b/osdc/modules/keda/deploy.sh
new file mode 100755
index 00000000..59f422ad
--- /dev/null
+++ b/osdc/modules/keda/deploy.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+set -euo pipefail
+#
+# KEDA module deploy script.
+# Called by: just deploy-module <cluster> keda
+# Args: $1=cluster-id  $2=cluster-name  $3=region
+#
+# Installs the KEDA operator (provides the ScaledObject CRD that the buildkit
+# module uses to autoscale builders).
+
+CLUSTER="$1"
+MODULE_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="${OSDC_ROOT:-$(cd "$MODULE_DIR/../.." && pwd)}"
+UPSTREAM_ROOT="${OSDC_UPSTREAM:-$REPO_ROOT}"
+# shellcheck source=/dev/null
+source "$UPSTREAM_ROOT/scripts/mise-activate.sh"
+# shellcheck source=/dev/null
+source "$UPSTREAM_ROOT/scripts/helm-upgrade.sh"
+CFG="$UPSTREAM_ROOT/scripts/cluster-config.py"
+
+NAMESPACE="keda"
+CHART_VERSION=$(uv run "$CFG" "$CLUSTER" keda.chart_version 2.16.1)
+
+helm repo add kedacore https://kedacore.github.io/charts >/dev/null 2>&1 || true
+helm repo update kedacore >/dev/null 2>&1 || true
+
+helm_upgrade_if_changed keda "$NAMESPACE" \
+  --create-namespace \
+  --version "$CHART_VERSION" \
+  --timeout 5m \
+  --wait \
+  kedacore/keda
+
+echo "KEDA deployed (chart $CHART_VERSION)."

From 0475d6da04e9732b7e0cb62d3792b7baa616f2cf Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 02:07:35 -0700
Subject: [PATCH 03/14] Update (base update)

[ghstack-poisoned]
---
 osdc/modules/buildkit/README.md               | 15 +++++-
 .../scripts/python/generate_buildkit.py       | 48 +++++++++++--------
 2 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md
index e17fafdf..37d64f9c 100644
--- a/osdc/modules/buildkit/README.md
+++ b/osdc/modules/buildkit/README.md
@@ -24,7 +24,20 @@ back to a small warm baseline when idle.
   limits are sized to `*_max`.
 - **Safe scale-down** — `preStop` drain (waits until the pod's `:1234` is idle)
   + long `terminationGracePeriodSeconds` + PDB, so a build is never killed
-  mid-flight.
+  mid-flight. Scale-down removes an arbitrary pod, which may be mid-build; the
+  drain holds termination until that build finishes, but
+  `terminationGracePeriodSeconds` is a hard SIGKILL cap, so it must outlast the
+  longest possible build. It's set to **8100s (135m) = 120m** (the max time a
+  docker build may run, matching HAProxy `timeout server`) **+ ~15m** of
+  headroom for the drain's idle-detection polling. A build that starts just
+  before drain still completes; the cap only fires as a backstop if a pod never
+  drains.
+  The **PDB** (`maxUnavailable: 1` per arch) bounds *voluntary* disruptions —
+  node consolidation and manual `kubectl drain` — to one builder per arch at a
+  time, so those go through the preStop drain one pod at a time instead of
+  evicting several in-flight builds at once. (KEDA scale-down deletes pods
+  directly rather than via the eviction API, so it isn't PDB-gated — the drain +
+  grace cap above is what protects that path.)
 
 Build clients should retry the connect so a build can wait for a pod from a cold
 or queued pool.
diff --git a/osdc/modules/buildkit/scripts/python/generate_buildkit.py b/osdc/modules/buildkit/scripts/python/generate_buildkit.py
index 830b8998..35f00fd0 100644
--- a/osdc/modules/buildkit/scripts/python/generate_buildkit.py
+++ b/osdc/modules/buildkit/scripts/python/generate_buildkit.py
@@ -36,6 +36,12 @@
 RED = "\033[0;31m"
 NC = "\033[0m"
 
+# Sentinel for optional template lines. An optional fragment is either its YAML
+# lines or this sentinel; lines equal to it are dropped when a block is assembled
+# (see _deployment_block). This lets every fragment sit on its own line in the
+# templates below instead of being concatenated onto an adjacent line.
+_OMIT = "<<omit>>"
+
 
 def log_info(msg):
     print(f"{GREEN}\u2192{NC} {msg}")
@@ -120,34 +126,32 @@ def generate_deployment_yaml(
     amd64_res = compute_pod_resources(amd64_instance, amd64_pods_per_node)
 
     # When KEDA owns the replica count, omit `replicas` and add a preStop drain
-    # that holds the pod open until its in-flight build finishes. `replicas_line`
-    # is computed per-arch inside _deployment_block (below).
-    grace_line = "      terminationGracePeriodSeconds: 8100\n" if autoscaling else ""
+    # that holds the pod open until its in-flight build finishes. Each fragment is
+    # either its YAML lines or _OMIT (dropped at assembly), so it sits on its own
+    # line in the template. (`replicas_line` is per-arch — computed below.)
+    grace_line = "      terminationGracePeriodSeconds: 8100" if autoscaling else _OMIT
     lifecycle_block = (
-        """
-          lifecycle:
+        """          lifecycle:
             preStop:
               exec:
                 command: ["/bin/sh", "/opt/drain/drain.sh"]"""
         if autoscaling
-        else ""
+        else _OMIT
     )
     drain_mount = (
-        """
-            - name: drain
+        """            - name: drain
               mountPath: /opt/drain
               readOnly: true"""
         if autoscaling
-        else ""
+        else _OMIT
     )
     drain_volume = (
-        """
-        - name: drain
+        """        - name: drain
           configMap:
             name: buildkitd-drain
             defaultMode: 0555"""
         if autoscaling
-        else ""
+        else _OMIT
     )
 
     log_info(
@@ -160,8 +164,8 @@ def generate_deployment_yaml(
     )
 
     def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_node):
-        replicas_line = "" if autoscaling else f"  replicas: {replicas}\n"
-        return f"""apiVersion: apps/v1
+        replicas_line = _OMIT if autoscaling else f"  replicas: {replicas}"
+        block = f"""apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: buildkitd-{arch}
@@ -172,7 +176,8 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
     app.kubernetes.io/name: buildkitd
     app.kubernetes.io/component: build-service
 spec:
-{replicas_line}  strategy:
+{replicas_line}
+  strategy:
     type: RollingUpdate
     rollingUpdate:
       maxSurge: 0
@@ -187,7 +192,8 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
         app: buildkitd
         arch: {arch}
     spec:
-{grace_line}      nodeSelector:
+{grace_line}
+      nodeSelector:
         workload-type: buildkit
         instance-type: "{instance_type}"
 
@@ -240,7 +246,8 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
                   fieldPath: metadata.name
 
           securityContext:
-            privileged: true{lifecycle_block}
+            privileged: true
+{lifecycle_block}
 
           readinessProbe:
             exec:
@@ -269,7 +276,8 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
               subPathExpr: $(POD_NAME)
             - name: git-cache
               mountPath: /opt/git-cache
-              readOnly: true{drain_mount}
+              readOnly: true
+{drain_mount}
 
       volumes:
         - name: config
@@ -284,7 +292,9 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
         - name: git-cache
           hostPath:
             path: /mnt/k8s-disks/0/git-cache
-            type: DirectoryOrCreate{drain_volume}"""
+            type: DirectoryOrCreate
+{drain_volume}"""
+        return "\n".join(line for line in block.splitlines() if line != _OMIT)
 
     arm64_block = _deployment_block(
         "arm64", arm64_instance, arm64_res["cpu"], arm64_res["memory_gi"], arm64_replicas, arm64_pods_per_node

From 24a5f136b08f884c2916ed7570ce4485b309da4c Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 02:41:02 -0700
Subject: [PATCH 04/14] Update

[ghstack-poisoned]
---
 .../docker/test-buildkit-scale/Dockerfile     |  6 --
 .../scripts/python/phases.py                  | 14 ++---
 .../scripts/python/test_run.py                |  9 ++-
 .../workflows/build-image-scale.yaml          | 56 +++++++++----------
 4 files changed, 39 insertions(+), 46 deletions(-)
 delete mode 100644 osdc/integration-tests/docker/test-buildkit-scale/Dockerfile

diff --git a/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile b/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile
deleted file mode 100644
index e317eb61..00000000
--- a/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile
+++ /dev/null
@@ -1,6 +0,0 @@
-# Holds one BuildKit slot (~10 min) so a burst of parallel builds exceeds the
-# warm baseline and forces KEDA scale-up. CACHEBUST differs per build so each
-# actually runs (no layer reuse).
-FROM alpine:3.21
-ARG CACHEBUST
-RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600
diff --git a/osdc/integration-tests/scripts/python/phases.py b/osdc/integration-tests/scripts/python/phases.py
index 17654000..04d3589d 100644
--- a/osdc/integration-tests/scripts/python/phases.py
+++ b/osdc/integration-tests/scripts/python/phases.py
@@ -276,17 +276,17 @@ def prepare_pr(
     # Write integration test workflow
     (workflows_dir / "integration-test.yaml").write_text(workflow_content)
 
-    # Copy reusable BuildKit workflows
+    # Copy reusable BuildKit workflows (connectivity + autoscaling scale test).
+    # The scale test builds an inline Dockerfile, so it needs no copied context.
     wf_root = upstream_dir / "integration-tests" / "workflows"
     for wf in ("build-image.yaml", "build-image-scale.yaml"):
         (workflows_dir / wf).write_text((wf_root / wf).read_text())
 
-    # Copy test Dockerfiles (connectivity + autoscaling scale test)
-    docker_root = upstream_dir / "integration-tests" / "docker"
-    for name in ("test-buildkit", "test-buildkit-scale"):
-        dst = canary_path / "docker" / name
-        dst.mkdir(parents=True, exist_ok=True)
-        (dst / "Dockerfile").write_text((docker_root / name / "Dockerfile").read_text())
+    # Copy test Dockerfile (connectivity test context)
+    docker_dir = canary_path / "docker" / "test-buildkit"
+    docker_dir.mkdir(parents=True, exist_ok=True)
+    dockerfile_src = upstream_dir / "integration-tests" / "docker" / "test-buildkit" / "Dockerfile"
+    (docker_dir / "Dockerfile").write_text(dockerfile_src.read_text())
 
     # Commit
     run_cmd(["git", "add", "-A"], cwd=canary_path)
diff --git a/osdc/integration-tests/scripts/python/test_run.py b/osdc/integration-tests/scripts/python/test_run.py
index dc4d5188..5f51e8d2 100644
--- a/osdc/integration-tests/scripts/python/test_run.py
+++ b/osdc/integration-tests/scripts/python/test_run.py
@@ -112,13 +112,12 @@ def workflow_template(tmp_path):
     )
     (wf_dir / "integration-test.yaml.tpl").write_text(template)
 
-    # Also create reusable workflows and Dockerfiles for prepare_pr
+    # Also create reusable workflows and Dockerfile for prepare_pr
     (wf_dir / "build-image.yaml").write_text("name: build-image\n")
     (wf_dir / "build-image-scale.yaml").write_text("name: build-image-scale\n")
-    for name in ("test-buildkit", "test-buildkit-scale"):
-        docker_dir = upstream / "integration-tests" / "docker" / name
-        docker_dir.mkdir(parents=True)
-        (docker_dir / "Dockerfile").write_text("FROM alpine\n")
+    docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit"
+    docker_dir.mkdir(parents=True)
+    (docker_dir / "Dockerfile").write_text("FROM alpine\n")
 
     return upstream
 
diff --git a/osdc/integration-tests/workflows/build-image-scale.yaml b/osdc/integration-tests/workflows/build-image-scale.yaml
index b485dc5a..dbc3d185 100644
--- a/osdc/integration-tests/workflows/build-image-scale.yaml
+++ b/osdc/integration-tests/workflows/build-image-scale.yaml
@@ -1,10 +1,12 @@
 # Reusable workflow: BuildKit autoscaling scale test.
-# Launches a burst of 8 parallel builds against one arch's remote BuildKit, each
-# holding a slot (server maxconn=1) for ~10 min via a sleep. The warm baseline
-# (amd64_min / arm64_min) is below the burst, so the builds finish within
-# timeout-minutes only if KEDA scales the pool up. Without scale-up they
-# serialize through the baseline pods and the back of the queue times out — i.e.
-# this job FAILS when autoscaling does not happen.
+# Mirrors the real .ci/docker/docker-builds.yml path — Docker Buildx with the
+# remote driver pointed at the per-arch buildkitd endpoint — but builds a
+# trivial image that just sleeps, so each of the 8 parallel jobs holds one
+# BuildKit slot (server maxconn=1) for ~10 min. The warm baseline is below the
+# burst, so the builds finish within timeout-minutes only if KEDA scales the
+# pool up; without scale-up they serialize through the baseline pods and the
+# back of the queue times out — i.e. this job FAILS when autoscaling does not
+# happen.
 name: BuildKit Scale Test
 
 on:
@@ -16,9 +18,8 @@ on:
         type: string
       runner_label:
         description: "Runner label to use (includes cluster prefix)"
-        required: false
+        required: true
         type: string
-        default: "l-x86iavx512-2-4"
 
 jobs:
   scale:
@@ -33,27 +34,26 @@ jobs:
     container:
       image: ghcr.io/actions/actions-runner:latest
     steps:
-      - name: Install buildctl
-        run: |
-          BUILDKIT_VERSION="v0.29.0"
-          mkdir -p "$HOME/.local/bin"
-          curl -sSL "https://github.com/moby/buildkit/releases/download/${BUILDKIT_VERSION}/buildkit-${BUILDKIT_VERSION}.linux-amd64.tar.gz" \
-            | tar xz --strip-components=1 -C "$HOME/.local/bin" bin/buildctl
-          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
-          "$HOME/.local/bin/buildctl" --version
-
-      - name: Checkout
-        uses: actions/checkout@v4
+      - name: Set up Docker Buildx (remote)
+        uses: docker/setup-buildx-action@v3
+        with:
+          driver: remote
+          endpoint: tcp://buildkitd-${{ inputs.arch }}.buildkit:1234
 
       - name: Occupy a BuildKit slot (~10 min) to drive autoscaling
+        shell: bash
         run: |
-          ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
-          echo "Build ${{ matrix.replica }} -> $ENDPOINT"
-          buildctl --addr "$ENDPOINT" build \
-            --frontend dockerfile.v0 \
-            --local context=docker/test-buildkit-scale \
-            --local dockerfile=docker/test-buildkit-scale \
-            --opt build-arg:CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }} \
+          set -ex
+          cat > Dockerfile.scale <<'EOF'
+          FROM alpine:3.21
+          ARG CACHEBUST
+          RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600
+          EOF
+          # --no-cache + a per-job CACHEBUST make each build distinct so it
+          # actually runs the sleep; cacheonly keeps it remote (no push/load).
+          docker buildx build \
+            --platform "linux/${{ inputs.arch }}" \
+            --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \
             --no-cache \
-            --output type=cacheonly
-          echo "PASS: build ${{ matrix.replica }} finished within timeout"
+            --output type=cacheonly \
+            -f Dockerfile.scale .

From c906ed44af0bafd64ecdd90ed4541571fc41f4c6 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 02:51:46 -0700
Subject: [PATCH 05/14] Update (base update)

[ghstack-poisoned]
---
 .../buildkit/kubernetes/base/haproxy.yaml        |  6 +++---
 .../buildkit/scripts/python/generate_buildkit.py | 16 +++++++++-------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
index a52cece7..45c630d6 100644
--- a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
+++ b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
@@ -25,9 +25,9 @@ data:
       timeout client 120m
       timeout server 120m
       # Queue a build (server maxconn=1, below) while KEDA/Karpenter add pods,
-      # instead of stacking it on a busy pod. The runner also retries the
-      # connect, so this is an upper bound, not the only safety net.
-      timeout queue 10m
+      # instead of stacking it on a busy pod. Keep <= 120m, the max time a
+      # docker build is allowed to run.
+      timeout queue 60m
       log global
       option tcplog
 
diff --git a/osdc/modules/buildkit/scripts/python/generate_buildkit.py b/osdc/modules/buildkit/scripts/python/generate_buildkit.py
index 35f00fd0..b39a09a2 100644
--- a/osdc/modules/buildkit/scripts/python/generate_buildkit.py
+++ b/osdc/modules/buildkit/scripts/python/generate_buildkit.py
@@ -127,9 +127,11 @@ def generate_deployment_yaml(
 
     # When KEDA owns the replica count, omit `replicas` and add a preStop drain
     # that holds the pod open until its in-flight build finishes. Each fragment is
-    # either its YAML lines or _OMIT (dropped at assembly), so it sits on its own
-    # line in the template. (`replicas_line` is per-arch — computed below.)
-    grace_line = "      terminationGracePeriodSeconds: 8100" if autoscaling else _OMIT
+    # either its YAML or _OMIT; _deployment_block drops _OMIT lines (matched after
+    # stripping), so single lines carry their indent in the template (e.g.
+    # `      {grace_line}`) while multi-line blocks self-indent. (`replicas_line`
+    # is per-arch — computed below.)
+    grace_line = "terminationGracePeriodSeconds: 8100" if autoscaling else _OMIT
     lifecycle_block = (
         """          lifecycle:
             preStop:
@@ -164,7 +166,7 @@ def generate_deployment_yaml(
     )
 
     def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_node):
-        replicas_line = _OMIT if autoscaling else f"  replicas: {replicas}"
+        replicas_line = _OMIT if autoscaling else f"replicas: {replicas}"
         block = f"""apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -176,7 +178,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
     app.kubernetes.io/name: buildkitd
     app.kubernetes.io/component: build-service
 spec:
-{replicas_line}
+  {replicas_line}
   strategy:
     type: RollingUpdate
     rollingUpdate:
@@ -192,7 +194,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
         app: buildkitd
         arch: {arch}
     spec:
-{grace_line}
+      {grace_line}
       nodeSelector:
         workload-type: buildkit
         instance-type: "{instance_type}"
@@ -294,7 +296,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no
             path: /mnt/k8s-disks/0/git-cache
             type: DirectoryOrCreate
 {drain_volume}"""
-        return "\n".join(line for line in block.splitlines() if line != _OMIT)
+        return "\n".join(line for line in block.splitlines() if line.strip() != _OMIT)
 
     arm64_block = _deployment_block(
         "arm64", arm64_instance, arm64_res["cpu"], arm64_res["memory_gi"], arm64_replicas, arm64_pods_per_node

From dd8d8e08e54cc155e91855658c2457d5a7635356 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 03:52:30 -0700
Subject: [PATCH 06/14] Update (base update)

[ghstack-poisoned]
---
 osdc/modules/buildkit/README.md                    | 11 +++++++++++
 osdc/modules/buildkit/deploy.sh                    | 10 +++++++++-
 osdc/modules/buildkit/kubernetes/base/haproxy.yaml |  6 ++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md
index 37d64f9c..8f63639b 100644
--- a/osdc/modules/buildkit/README.md
+++ b/osdc/modules/buildkit/README.md
@@ -42,4 +42,15 @@ back to a small warm baseline when idle.
 Build clients should retry the connect so a build can wait for a pod from a cold
 or queued pool.
 
+## HAProxy config changes roll the LB
+
+HAProxy renders its config only at container start, and nothing else restarts
+the `buildkitd-lb` pod, so a bare ConfigMap update (`maxconn`, timeouts,
+backends) would silently not take effect. `deploy.sh` stamps the LB pod template
+with a `checksum/config` annotation = a hash of `haproxy.yaml`; when the config
+changes the hash changes, which rolls the Deployment so the new pod picks up the
+new config. An unchanged config keeps the same hash, so routine deploys don't
+churn the LB. (The buildkitd worker pods do **not** yet have this, so a
+`buildkitd.toml` / `drain.sh` change needs a manual rollout to take effect.)
+
 Requires the `keda` module deployed before `buildkit` (provides the CRDs).
diff --git a/osdc/modules/buildkit/deploy.sh b/osdc/modules/buildkit/deploy.sh
index ece804ad..f97d0f42 100755
--- a/osdc/modules/buildkit/deploy.sh
+++ b/osdc/modules/buildkit/deploy.sh
@@ -75,7 +75,15 @@ sed "s/CLUSTER_NAME_PLACEHOLDER/$CNAME/g" "$GENERATED_DIR/nodepools.yaml" | kube
 # --- Apply static k8s resources ---
 
 echo "Applying BuildKit static manifests..."
-kubectl_apply_if_changed -k "$MODULE_DIR/kubernetes/base/"
+# Stamp the buildkitd-lb pod template with a hash of haproxy.yaml. HAProxy reads
+# its config only at container start, and nothing else restarts the LB, so
+# without this a ConfigMap change (maxconn, timeouts, backends) would silently
+# not take effect until the pod happened to be recreated. Changing the hash
+# rolls the Deployment whenever the config changes.
+HAPROXY_SUM=$(shasum -a 256 "$MODULE_DIR/kubernetes/base/haproxy.yaml" | cut -c1-12)
+kubectl kustomize "$MODULE_DIR/kubernetes/base/" \
+  | sed "s/__HAPROXY_CFG_CHECKSUM__/$HAPROXY_SUM/" \
+  | kubectl_apply_if_changed -f -
 
 # --- Apply generated Deployments (only if changed) ---
 
diff --git a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
index 45c630d6..5b5f2f7f 100644
--- a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
+++ b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
@@ -104,6 +104,12 @@ spec:
     metadata:
       labels:
         app: buildkitd-lb
+      annotations:
+        # HAProxy renders its config once at container start, so a ConfigMap
+        # change has no effect until this pod restarts. deploy.sh fills this in
+        # with a hash of haproxy.yaml, so the Deployment rolls automatically
+        # whenever the config changes (nothing else restarts the LB).
+        checksum/config: "__HAPROXY_CFG_CHECKSUM__"
     spec:
       # Runs on base-infra nodes (CriticalAddonsOnly taint)
       tolerations:

From 190e88c94e300e6cb62afd05e74e8cc4045d8b61 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 09:32:10 -0700
Subject: [PATCH 07/14] Update

[ghstack-poisoned]
---
 .../workflows/build-image-scale.yaml          | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/osdc/integration-tests/workflows/build-image-scale.yaml b/osdc/integration-tests/workflows/build-image-scale.yaml
index dbc3d185..86ccc08e 100644
--- a/osdc/integration-tests/workflows/build-image-scale.yaml
+++ b/osdc/integration-tests/workflows/build-image-scale.yaml
@@ -34,11 +34,20 @@ jobs:
     container:
       image: ghcr.io/actions/actions-runner:latest
     steps:
-      - name: Set up Docker Buildx (remote)
-        uses: docker/setup-buildx-action@v3
-        with:
-          driver: remote
-          endpoint: tcp://buildkitd-${{ inputs.arch }}.buildkit:1234
+      - name: Set up Docker Buildx (remote, no bootstrap)
+        shell: bash
+        # NOT docker/setup-buildx-action: it always runs `buildx inspect
+        # --bootstrap`, whose remote-driver health check gives up after a
+        # hardcoded ~20s — shorter than a cold scale-up, so jobs fail at setup
+        # during a burst. `create` (no --bootstrap) just registers the builder;
+        # the build step below then waits in HAProxy's queue during scale-up.
+        run: |
+          set -ex
+          docker buildx create \
+            --name osdc-remote \
+            --driver remote \
+            --use \
+            "tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
 
       - name: Occupy a BuildKit slot (~10 min) to drive autoscaling
         shell: bash
@@ -51,6 +60,10 @@ jobs:
           EOF
           # --no-cache + a per-job CACHEBUST make each build distinct so it
           # actually runs the sleep; cacheonly keeps it remote (no push/load).
+          # No retry needed: with the builder created above (no --bootstrap), the
+          # build connection waits in HAProxy's queue for a pod to free / scale up
+          # (bounded by `timeout queue`), instead of buildx's ~20s bootstrap gate.
+          # The 30-min job timeout still fails the test if scale-up never happens.
           docker buildx build \
             --platform "linux/${{ inputs.arch }}" \
             --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \

From 2453c6fb7a1144946b41e68ef603208cf2da9f4c Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 10:29:00 -0700
Subject: [PATCH 08/14] Update (base update)

[ghstack-poisoned]
---
 osdc/modules/buildkit/README.md | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md
index 8f63639b..a98ee0ac 100644
--- a/osdc/modules/buildkit/README.md
+++ b/osdc/modules/buildkit/README.md
@@ -39,8 +39,28 @@ back to a small warm baseline when idle.
   directly rather than via the eviction API, so it isn't PDB-gated — the drain +
   grace cap above is what protects that path.)
 
-Build clients should retry the connect so a build can wait for a pod from a cold
-or queued pool.
+## Build with `buildctl`, not `docker buildx`
+
+Clients must reach the pool with **`buildctl`** (`buildctl --addr
+tcp://buildkitd-<arch>.buildkit:1234 build ...`), not `docker buildx` against a
+remote builder.
+
+The autoscaling design relies on a *patient* client: during a burst the build's
+connection sits in HAProxy's queue (above) for the minutes it takes KEDA +
+Karpenter to add a pod, and that pending connection is also what keeps the
+scale-up signal alive. `buildctl` does exactly this — its build call waits in
+the queue up to `timeout queue` with no separate connect deadline.
+
+`docker buildx` does **not**: before solving it "boots" the remote builder with
+a **hardcoded ~20s connect timeout** (`[internal] waiting for connection`), which
+is not configurable and far shorter than a cold scale-up. Under a burst the
+connection is still queued at 20s, buildx aborts with `context deadline
+exceeded`, and — because the connection then drops — the scale-up signal
+disappears before KEDA can act, so the pool never grows and every queued build
+fails. (`docker/setup-buildx-action` hits the same gate via `inspect
+--bootstrap`; removing it doesn't help because `docker buildx build` re-runs the
+same boot.) This was confirmed on the staging cluster. So PyTorch's
+`.ci/docker/build.sh` uses `buildctl` whenever `REMOTE_BUILDKIT` is set.
 
 ## HAProxy config changes roll the LB
 

From 5d09df7ad72ae6fec701abf0265c6a65227d540b Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 11:44:16 -0700
Subject: [PATCH 09/14] Update (base update)

[ghstack-poisoned]
---
 osdc/modules/buildkit/README.md               | 40 ++++++++-----------
 .../buildkit/kubernetes/base/haproxy.yaml     |  4 --
 2 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md
index a98ee0ac..6e4e30eb 100644
--- a/osdc/modules/buildkit/README.md
+++ b/osdc/modules/buildkit/README.md
@@ -13,9 +13,9 @@ Absorb bursts of concurrent builds without overloading existing pods, and scale
 back to a small warm baseline when idle.
 
 - **One build per pod** — HAProxy `server maxconn 1` (matches buildkitd
-  `max-parallelism = 1`). Excess builds **queue** in HAProxy (`timeout queue`)
-  instead of stacking on a busy pod; as new pods register (DNS), queued builds
-  flow onto them, so scaled-up pods never sit idle.
+  `max-parallelism = 1`) so a build never stacks on a busy pod. When every pod is
+  busy the LB has no slot, so the client must **retry the connect** (see below)
+  until a pod frees or the pool scales up.
 - **In-cluster scale signal** — KEDA `ScaledObject` per arch, `metrics-api`
   scraping the LB's own metrics (`haproxy_backend_current_sessions`) — no external
   metrics backend.
@@ -39,28 +39,22 @@ back to a small warm baseline when idle.
   directly rather than via the eviction API, so it isn't PDB-gated — the drain +
   grace cap above is what protects that path.)
 
-## Build with `buildctl`, not `docker buildx`
+## Clients must retry the connect
 
-Clients must reach the pool with **`buildctl`** (`buildctl --addr
-tcp://buildkitd-<arch>.buildkit:1234 build ...`), not `docker buildx` against a
-remote builder.
+Build clients (both `docker buildx` and `buildctl`) use the `moby/buildkit` Go
+client, which dials with gRPC's default **~20s `MinConnectTimeout`** and
+**fail-fast** RPCs — there is no client-side flag to make it wait longer. During
+a burst, a build whose connection finds no free pod (`maxconn 1`) is dropped by
+the client after ~20s, well before KEDA/Karpenter can add a pod (minutes). An
+HAProxy-side `timeout queue` does **not** help: the client gives up at 20s
+regardless, so queueing on the LB is pointless (and was removed).
 
-The autoscaling design relies on a *patient* client: during a burst the build's
-connection sits in HAProxy's queue (above) for the minutes it takes KEDA +
-Karpenter to add a pod, and that pending connection is also what keeps the
-scale-up signal alive. `buildctl` does exactly this — its build call waits in
-the queue up to `timeout queue` with no separate connect deadline.
-
-`docker buildx` does **not**: before solving it "boots" the remote builder with
-a **hardcoded ~20s connect timeout** (`[internal] waiting for connection`), which
-is not configurable and far shorter than a cold scale-up. Under a burst the
-connection is still queued at 20s, buildx aborts with `context deadline
-exceeded`, and — because the connection then drops — the scale-up signal
-disappears before KEDA can act, so the pool never grows and every queued build
-fails. (`docker/setup-buildx-action` hits the same gate via `inspect
---bootstrap`; removing it doesn't help because `docker buildx build` re-runs the
-same boot.) This was confirmed on the staging cluster. So PyTorch's
-`.ci/docker/build.sh` uses `buildctl` whenever `REMOTE_BUILDKIT` is set.
+So the **client must retry the build** on connection failures until a pod is
+free or the pool has scaled up; the repeated attempts also keep the autoscaler's
+load signal alive. PyTorch's `.ci/docker/build.sh` does this when
+`REMOTE_BUILDKIT` is set, and the workflow creates the remote builder *without*
+`--bootstrap` (the `docker buildx inspect --bootstrap` health check hits the same
+20s gate at setup). This was confirmed on the staging cluster.
 
 ## HAProxy config changes roll the LB
 
diff --git a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
index 5b5f2f7f..d37eeca9 100644
--- a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
+++ b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml
@@ -24,10 +24,6 @@ data:
       timeout connect 5s
       timeout client 120m
       timeout server 120m
-      # Queue a build (server maxconn=1, below) while KEDA/Karpenter add pods,
-      # instead of stacking it on a busy pod. Keep <= 120m, the max time a
-      # docker build is allowed to run.
-      timeout queue 60m
       log global
       option tcplog
 

From 031399ae5b50a2115c2a535bcd975a61a12ef575 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 12:18:11 -0700
Subject: [PATCH 10/14] Update

[ghstack-poisoned]
---
 .../workflows/build-image.yaml                | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/osdc/integration-tests/workflows/build-image.yaml b/osdc/integration-tests/workflows/build-image.yaml
index b523d03d..74956d19 100644
--- a/osdc/integration-tests/workflows/build-image.yaml
+++ b/osdc/integration-tests/workflows/build-image.yaml
@@ -38,18 +38,28 @@ jobs:
 
       - name: Build test image via BuildKit
         run: |
+          set -eu
           echo "=== BuildKit ${{ inputs.arch }} connectivity test ==="
           ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
           echo "Connecting to: $ENDPOINT"
 
-          buildctl --addr "$ENDPOINT" build \
-            --frontend dockerfile.v0 \
-            --local context=docker/test-buildkit \
-            --local dockerfile=docker/test-buildkit \
-            --output type=image,name=ghcr.io/${{ github.repository }}:integration-test-${{ inputs.arch }}-${{ github.sha }},push=false
-
-          echo "PASS: BuildKit ${{ inputs.arch }} built successfully"
-          echo "Endpoint: $ENDPOINT"
+          # The buildkit client dials with gRPC's ~20s connect timeout, so a
+          # busy / cold autoscaled pool drops the connection fast. Retry until a
+          # pod is free or the pool scales up.
+          for attempt in $(seq 1 15); do
+            if buildctl --addr "$ENDPOINT" build \
+                --frontend dockerfile.v0 \
+                --local context=docker/test-buildkit \
+                --local dockerfile=docker/test-buildkit \
+                --output type=image,name=ghcr.io/${{ github.repository }}:integration-test-${{ inputs.arch }}-${{ github.sha }},push=false; then
+              echo "PASS: BuildKit ${{ inputs.arch }} built on attempt ${attempt} ($ENDPOINT)"
+              exit 0
+            fi
+            echo "attempt ${attempt}/15 failed (BuildKit cold/queued); retrying in 15s..."
+            sleep 15
+          done
+          echo "FAIL: BuildKit ${{ inputs.arch }} build failed after retries" >&2
+          exit 1
 
       - name: Verify BuildKit endpoint info
         run: |

From 3536b0e5a5691ebe89d0159e3f70237e8a6aa8e3 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 12:25:23 -0700
Subject: [PATCH 11/14] Update

[ghstack-poisoned]
---
 .../scripts/python/phases.py                  |  9 +--
 .../scripts/python/test_run.py                |  3 +-
 .../workflows/build-image-scale.yaml          | 78 -------------------
 .../workflows/build-image.yaml                | 65 +++++++++++++++-
 .../workflows/integration-test.yaml.tpl       | 20 +----
 5 files changed, 71 insertions(+), 104 deletions(-)
 delete mode 100644 osdc/integration-tests/workflows/build-image-scale.yaml

diff --git a/osdc/integration-tests/scripts/python/phases.py b/osdc/integration-tests/scripts/python/phases.py
index 04d3589d..7a21f204 100644
--- a/osdc/integration-tests/scripts/python/phases.py
+++ b/osdc/integration-tests/scripts/python/phases.py
@@ -276,11 +276,10 @@ def prepare_pr(
     # Write integration test workflow
     (workflows_dir / "integration-test.yaml").write_text(workflow_content)
 
-    # Copy reusable BuildKit workflows (connectivity + autoscaling scale test).
-    # The scale test builds an inline Dockerfile, so it needs no copied context.
-    wf_root = upstream_dir / "integration-tests" / "workflows"
-    for wf in ("build-image.yaml", "build-image-scale.yaml"):
-        (workflows_dir / wf).write_text((wf_root / wf).read_text())
+    # Copy the reusable BuildKit workflow (connectivity + autoscaling scale jobs).
+    # The scale job builds an inline Dockerfile, so it needs no copied context.
+    build_wf_src = upstream_dir / "integration-tests" / "workflows" / "build-image.yaml"
+    (workflows_dir / "build-image.yaml").write_text(build_wf_src.read_text())
 
     # Copy test Dockerfile (connectivity test context)
     docker_dir = canary_path / "docker" / "test-buildkit"
diff --git a/osdc/integration-tests/scripts/python/test_run.py b/osdc/integration-tests/scripts/python/test_run.py
index 5f51e8d2..58e6ab48 100644
--- a/osdc/integration-tests/scripts/python/test_run.py
+++ b/osdc/integration-tests/scripts/python/test_run.py
@@ -112,9 +112,8 @@ def workflow_template(tmp_path):
     )
     (wf_dir / "integration-test.yaml.tpl").write_text(template)
 
-    # Also create reusable workflows and Dockerfile for prepare_pr
+    # Also create reusable workflow and Dockerfile for prepare_pr
     (wf_dir / "build-image.yaml").write_text("name: build-image\n")
-    (wf_dir / "build-image-scale.yaml").write_text("name: build-image-scale\n")
     docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit"
     docker_dir.mkdir(parents=True)
     (docker_dir / "Dockerfile").write_text("FROM alpine\n")
diff --git a/osdc/integration-tests/workflows/build-image-scale.yaml b/osdc/integration-tests/workflows/build-image-scale.yaml
deleted file mode 100644
index e2618d00..00000000
--- a/osdc/integration-tests/workflows/build-image-scale.yaml
+++ /dev/null
@@ -1,78 +0,0 @@
-# Reusable workflow: BuildKit autoscaling scale test.
-# Mirrors the real .ci/docker/docker-builds.yml path — Docker Buildx with the
-# remote driver pointed at the per-arch buildkitd endpoint — but builds a
-# trivial image that just sleeps, so each of the 8 parallel jobs holds one
-# BuildKit slot (server maxconn=1) for ~10 min. The warm baseline is below the
-# burst, so the builds finish within timeout-minutes only if KEDA scales the
-# pool up; without scale-up they serialize through the baseline pods and the
-# back of the queue times out — i.e. this job FAILS when autoscaling does not
-# happen.
-name: BuildKit Scale Test
-
-on:
-  workflow_call:
-    inputs:
-      arch:
-        description: "Target architecture (amd64 or arm64)"
-        required: true
-        type: string
-      runner_label:
-        description: "Runner label to use (includes cluster prefix)"
-        required: true
-        type: string
-
-jobs:
-  scale:
-    # Runs on x86 — BuildKit is a *remote* builder; arch selects the endpoint.
-    # timeout-minutes is the gate: scaled-up ~18 min, serialized ~43 min.
-    runs-on: ${{ inputs.runner_label }}
-    timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix:
-        replica: [1, 2, 3, 4, 5, 6, 7, 8]
-    container:
-      image: ghcr.io/actions/actions-runner:latest
-    steps:
-      - name: Set up Docker Buildx (remote, no bootstrap)
-        shell: bash
-        # NOT docker/setup-buildx-action: it runs `buildx inspect --bootstrap`,
-        # whose ~20s connect timeout fails at setup during a cold scale-up.
-        # `create` (no --bootstrap) just registers the builder; the build step
-        # retries to wait out scale-up.
-        run: |
-          set -ex
-          docker buildx create \
-            --name osdc-remote \
-            --driver remote \
-            --use \
-            "tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
-
-      - name: Occupy a BuildKit slot (~10 min) to drive autoscaling
-        shell: bash
-        run: |
-          set -eu
-          cat > Dockerfile.scale <<'EOF'
-          FROM alpine:3.21
-          ARG CACHEBUST
-          RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600
-          EOF
-          # buildx boots the builder with a hardcoded ~20s connect timeout (gRPC
-          # MinConnectTimeout), so retry to wait out cold scale-up; the repeated
-          # attempts also keep KEDA's load signal alive. The 30-min job timeout
-          # still fails the test if scale-up never happens.
-          for attempt in $(seq 1 15); do
-            if docker buildx build \
-                --platform "linux/${{ inputs.arch }}" \
-                --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \
-                --no-cache \
-                --output type=cacheonly \
-                -f Dockerfile.scale .; then
-              echo "build succeeded on attempt ${attempt}"
-              exit 0
-            fi
-            echo "attempt ${attempt}/15 failed (BuildKit cold/queued); retrying in 15s..."
-            sleep 15
-          done
-          echo "build failed after retries" >&2
-          exit 1
diff --git a/osdc/integration-tests/workflows/build-image.yaml b/osdc/integration-tests/workflows/build-image.yaml
index 74956d19..04663043 100644
--- a/osdc/integration-tests/workflows/build-image.yaml
+++ b/osdc/integration-tests/workflows/build-image.yaml
@@ -1,6 +1,7 @@
-# Reusable workflow: Build a test image via OSDC BuildKit
-# Called by integration-test.yaml to validate BuildKit connectivity.
-# Uses buildctl directly — no Docker daemon required.
+# Reusable workflow: exercise OSDC BuildKit for one arch.
+# Called by integration-test.yaml. Two jobs:
+#   build  — single buildctl build (validates connectivity; buildctl route)
+#   scale  — burst of docker buildx builds (validates autoscaling; prod client)
 name: Build Test Image
 
 on:
@@ -66,3 +67,61 @@ jobs:
           ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
           buildctl --addr "$ENDPOINT" debug info || echo "WARN: debug info not available"
           echo "PASS: BuildKit ${{ inputs.arch }} endpoint is responsive"
+
+  scale:
+    # 8 parallel docker buildx builds (the prod client), each holding a BuildKit
+    # slot (server maxconn=1) ~10 min via a sleep. The warm baseline is below the
+    # burst, so they finish within timeout-minutes only if KEDA scales the pool
+    # up; otherwise the back of the burst serializes and the job times out — i.e.
+    # this FAILS if autoscaling does not happen.
+    runs-on: ${{ inputs.runner_label }}
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        replica: [1, 2, 3, 4, 5, 6, 7, 8]
+    container:
+      image: ghcr.io/actions/actions-runner:latest
+    steps:
+      - name: Set up Docker Buildx (remote, no bootstrap)
+        shell: bash
+        # NOT docker/setup-buildx-action: it runs `buildx inspect --bootstrap`,
+        # whose ~20s connect timeout fails at setup during a cold scale-up.
+        # `create` (no --bootstrap) just registers the builder; the build step
+        # retries to wait out scale-up.
+        run: |
+          set -ex
+          docker buildx create \
+            --name osdc-remote \
+            --driver remote \
+            --use \
+            "tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
+
+      - name: Occupy a BuildKit slot (~10 min) to drive autoscaling
+        shell: bash
+        run: |
+          set -eu
+          cat > Dockerfile.scale <<'EOF'
+          FROM alpine:3.21
+          ARG CACHEBUST
+          RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600
+          EOF
+          # buildx boots the builder with a hardcoded ~20s connect timeout (gRPC
+          # MinConnectTimeout), so retry to wait out cold scale-up; the repeated
+          # attempts also keep KEDA's load signal alive. The 30-min job timeout
+          # still fails the test if scale-up never happens.
+          for attempt in $(seq 1 15); do
+            if docker buildx build \
+                --platform "linux/${{ inputs.arch }}" \
+                --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \
+                --no-cache \
+                --output type=cacheonly \
+                -f Dockerfile.scale .; then
+              echo "build succeeded on attempt ${attempt}"
+              exit 0
+            fi
+            echo "attempt ${attempt}/15 failed (BuildKit cold/queued); retrying in 15s..."
+            sleep 15
+          done
+          echo "build failed after retries" >&2
+          exit 1
diff --git a/osdc/integration-tests/workflows/integration-test.yaml.tpl b/osdc/integration-tests/workflows/integration-test.yaml.tpl
index 7d4fb3aa..bb453f0d 100644
--- a/osdc/integration-tests/workflows/integration-test.yaml.tpl
+++ b/osdc/integration-tests/workflows/integration-test.yaml.tpl
@@ -1502,32 +1502,20 @@ jobs:
   # END_B200
 
   # ── BuildKit Tests ────────────────────────────────────────────────────
-  build-amd64:
+  # Each call runs a buildctl connectivity build + an 8-wide docker buildx burst
+  # (fails if KEDA does not scale the pool up).
+  buildkit-amd64:
     uses: ./.github/workflows/build-image.yaml
     with:
       arch: amd64
       runner_label: {{PREFIX}}l-x86iamx-8-32
 
-  build-arm64:
+  buildkit-arm64:
     uses: ./.github/workflows/build-image.yaml
     with:
       arch: arm64
       runner_label: {{PREFIX}}l-x86iamx-8-32
 
-  # ── BuildKit Autoscaling Scale Test ───────────────────────────────────
-  # Bursts 8 parallel builds per arch; fails if KEDA does not scale the pool up.
-  buildkit-scale-amd64:
-    uses: ./.github/workflows/build-image-scale.yaml
-    with:
-      arch: amd64
-      runner_label: {{PREFIX}}l-x86iamx-8-32
-
-  buildkit-scale-arm64:
-    uses: ./.github/workflows/build-image-scale.yaml
-    with:
-      arch: arm64
-      runner_label: {{PREFIX}}l-x86iamx-8-32
-
   # ── Harbor Cache Test ─────────────────────────────────────────────────
   test-harbor:
     runs-on: {{PREFIX}}l-x86iamx-8-32

From 94148dc5838b6401060046cf70be73fd3beeb8a4 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 13:00:53 -0700
Subject: [PATCH 12/14] Update

[ghstack-poisoned]
---
 .../workflows/build-image.yaml                | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/osdc/integration-tests/workflows/build-image.yaml b/osdc/integration-tests/workflows/build-image.yaml
index 04663043..df36cfc8 100644
--- a/osdc/integration-tests/workflows/build-image.yaml
+++ b/osdc/integration-tests/workflows/build-image.yaml
@@ -44,10 +44,11 @@ jobs:
           ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
           echo "Connecting to: $ENDPOINT"
 
-          # The buildkit client dials with gRPC's ~20s connect timeout, so a
-          # busy / cold autoscaled pool drops the connection fast. Retry until a
-          # pod is free or the pool scales up.
-          for attempt in $(seq 1 15); do
+          # The buildkit client dials with gRPC's ~20s connect timeout, so a busy
+          # / cold pool drops the connection fast (no HAProxy queue holds it).
+          # Retry long enough to outlast a peer's ~10 min build when the pool is
+          # over-subscribed (9 builds > 8 pods): 45 × (≈5s fail + 15s sleep) ≈ 15 min.
+          for attempt in $(seq 1 45); do
             if buildctl --addr "$ENDPOINT" build \
                 --frontend dockerfile.v0 \
                 --local context=docker/test-buildkit \
@@ -56,7 +57,7 @@ jobs:
               echo "PASS: BuildKit ${{ inputs.arch }} built on attempt ${attempt} ($ENDPOINT)"
               exit 0
             fi
-            echo "attempt ${attempt}/15 failed (BuildKit cold/queued); retrying in 15s..."
+            echo "attempt ${attempt}/45 failed (BuildKit cold/queued); retrying in 15s..."
             sleep 15
           done
           echo "FAIL: BuildKit ${{ inputs.arch }} build failed after retries" >&2
@@ -74,6 +75,10 @@ jobs:
     # burst, so they finish within timeout-minutes only if KEDA scales the pool
     # up; otherwise the back of the burst serializes and the job times out — i.e.
     # this FAILS if autoscaling does not happen.
+    #
+    # Runs concurrently with `build`, so 9 builds contend for a max-8 pool: the
+    # odd one out has no pod until a peer's ~10 min build finishes, exercising
+    # the over-subscription wait (the retry below must outlast that).
     runs-on: ${{ inputs.runner_label }}
     timeout-minutes: 30
     strategy:
@@ -107,10 +112,11 @@ jobs:
           RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600
           EOF
           # buildx boots the builder with a hardcoded ~20s connect timeout (gRPC
-          # MinConnectTimeout), so retry to wait out cold scale-up; the repeated
-          # attempts also keep KEDA's load signal alive. The 30-min job timeout
-          # still fails the test if scale-up never happens.
-          for attempt in $(seq 1 15); do
+          # MinConnectTimeout), so retry to wait out cold scale-up and, when the
+          # pool is over-subscribed, a peer's ~10 min build; the repeated attempts
+          # also keep KEDA's load signal alive. 45 × (≈5s fail + 15s sleep) ≈ 15 min,
+          # within the 30-min job timeout (still fails if scale-up never happens).
+          for attempt in $(seq 1 45); do
             if docker buildx build \
                 --platform "linux/${{ inputs.arch }}" \
                 --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \
@@ -120,7 +126,7 @@ jobs:
               echo "build succeeded on attempt ${attempt}"
               exit 0
             fi
-            echo "attempt ${attempt}/15 failed (BuildKit cold/queued); retrying in 15s..."
+            echo "attempt ${attempt}/45 failed (BuildKit cold/queued); retrying in 15s..."
             sleep 15
           done
           echo "build failed after retries" >&2

From 5b75928745d90964ff2efb2c8034c1fc26f2de19 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 16:23:14 -0700
Subject: [PATCH 13/14] Update (base update)

[ghstack-poisoned]
---
 osdc/modules/buildkit/README.md | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md
index 6e4e30eb..b38c1511 100644
--- a/osdc/modules/buildkit/README.md
+++ b/osdc/modules/buildkit/README.md
@@ -4,8 +4,9 @@ Remote BuildKit build service: per-arch `buildkitd` Deployments behind an HAProx
 LB, on dedicated Karpenter NodePools. Clients build with
 `buildctl --addr tcp://buildkitd-<arch>.buildkit:1234`.
 
-Sizing is per-arch in `clusters.yaml` (`buildkit.{amd64,arm64}_{replicas,pods_per_node}`,
-`*_instance_type`); pod CPU/mem is computed by `scripts/python/generate_buildkit.py`.
+Sizing is per-arch in `clusters.yaml` (`buildkit.{amd64,arm64}_*`: instance type,
+pods-per-node, and autoscaling `*_min` / `*_max`); pod CPU/mem is computed by
+`scripts/python/generate_buildkit.py`.
 
 ## Autoscaling (optional, `buildkit.autoscaling.enabled`)
 
@@ -18,10 +19,15 @@ back to a small warm baseline when idle.
   until a pod frees or the pool scales up.
 - **In-cluster scale signal** — KEDA `ScaledObject` per arch, `metrics-api`
   scraping the LB's own metrics (`haproxy_backend_current_sessions`) — no external
-  metrics backend.
+  metrics backend. If KEDA can't read the metric, a `fallback` (`*_fallback`,
+  e.g. 32/8 on prod) holds the proven fixed pool instead of freezing the count.
 - **Warm baseline** — `amd64_min` / `arm64_min` keep ≥1 node per arch up so the
   common case gets a free warm pod immediately. `*_max` caps the burst; NodePool
   limits are sized to `*_max`.
+- **No-flap scale-down** — KEDA holds a pod ~20 min after it goes idle
+  (`stabilizationWindowSeconds: 1200`), then sheds at most `max(10 pods, 20%)`
+  per 20 min, so a follow-up build reuses the pod's warm decompressed NVMe layer
+  cache. Node churn is left to Karpenter.
 - **Safe scale-down** — `preStop` drain (waits until the pod's `:1234` is idle)
   + long `terminationGracePeriodSeconds` + PDB, so a build is never killed
   mid-flight. Scale-down removes an arbitrary pod, which may be mid-build; the
@@ -67,4 +73,6 @@ new config. An unchanged config keeps the same hash, so routine deploys don't
 churn the LB. (The buildkitd worker pods do **not** yet have this, so a
 `buildkitd.toml` / `drain.sh` change needs a manual rollout to take effect.)
 
-Requires the `keda` module deployed before `buildkit` (provides the CRDs).
+Requires the `keda` module deployed before `buildkit` (provides the CRDs). The
+`monitoring` module scrapes the KEDA operator's metrics and ships
+buildkit-autoscaling alerts (scaler / fallback errors, queue backlog).

From 4a0f40c84e2f84cd303aa67d0be7ad809b6c8682 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Thu, 11 Jun 2026 09:19:55 -0700
Subject: [PATCH 14/14] Update (base update)

[ghstack-poisoned]
---
 osdc/clusters.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml
index bd618668..05f2ea4d 100644
--- a/osdc/clusters.yaml
+++ b/osdc/clusters.yaml
@@ -299,9 +299,9 @@ clusters:
       arm64_pods_per_node: 4
       autoscaling:
         enabled: true
-        amd64_min: 2   # 1x m6id.24xlarge (2 pods/node)
+        amd64_min: 32  # warm baseline = proven fixed pool (16x m6id.24xlarge)
         amd64_max: 360  # ~90d peak ≈180, x2 for headroom
-        arm64_min: 4   # 1x m7gd.16xlarge (4 pods/node)
+        arm64_min: 8   # warm baseline = proven fixed pool (2x m7gd.16xlarge)
         arm64_max: 30  # ~90d peak ≈15, x2 for headroom
         amd64_fallback: 32  # if KEDA can't read metrics, hold the proven fixed pool
         arm64_fallback: 8