pytorch · huydhn · Jun 11, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
@@ -83,6 +83,10 @@ defaults:
     cpu_disruption_budget: "20%"
   buildkit:
     replicas_per_arch: 12
+    autoscaling:
+      enabled: false
+  keda:
+    chart_version: "2.16.1"
   monitoring:
     grafana_cloud_url: "https://prometheus-prod-36-prod-us-west-0.grafana.net/api/prom/push"
     grafana_cloud_read_url: "https://prometheus-prod-36-prod-us-west-0.grafana.net/api/prom"
@@ -162,15 +166,22 @@ clusters:
       github_secret_name: pytorch-arc-staging
       runner_name_prefix: "c-mt-"
     buildkit:
-      replicas_per_arch: 2
       arm64_instance_type: m7gd.16xlarge
+      arm64_pods_per_node: 4
+      autoscaling:
+        enabled: true
+        amd64_min: 2  # 1x m6id.24xlarge (2 pods/node)
+        amd64_max: 8
+        arm64_min: 4  # 1x m7gd.16xlarge (4 pods/node)
+        arm64_max: 8
     pypi_cache:
       replicas: 1
     modules:
       - karpenter
       - arc
       - nodepools
       - arc-runners
+      - keda
       - buildkit
       - pypi-cache
       - cache-enforcer

@@ -0,0 +1,78 @@
+# BuildKit module
+
+Remote BuildKit build service: per-arch `buildkitd` Deployments behind an HAProxy
+LB, on dedicated Karpenter NodePools. Clients build with
+`buildctl --addr tcp://buildkitd-<arch>.buildkit:1234`.
+
+Sizing is per-arch in `clusters.yaml` (`buildkit.{amd64,arm64}_*` instance type
+and pods-per-node, plus `buildkit.autoscaling.*_min` / `*_max`); pod CPU/mem is
+computed by `scripts/python/generate_buildkit.py`.
+
+## Autoscaling (optional, `buildkit.autoscaling.enabled`)
+
+Absorb bursts of concurrent builds without overloading existing pods, and scale
+back to a small warm baseline when idle.
+
+- **One build per pod** — HAProxy `server maxconn 1` (matches buildkitd
+  `max-parallelism = 1`) so a build never stacks on a busy pod. When every pod is
+  busy the LB has no slot, so the client must **retry the connect** (see below)
+  until a pod frees or the pool scales up.
+- **In-cluster scale signal** — KEDA `ScaledObject` per arch, `metrics-api`
+  scraping the LB's own metrics (`haproxy_backend_current_sessions`) — no external
+  metrics backend. If KEDA can't read the metric, an optional `fallback`
+  (`amd64_fallback` / `arm64_fallback`, e.g. 32/8 on prod; default `0` = none)
+  holds the proven fixed pool instead of freezing the count.
+- **Warm baseline** — `amd64_min` / `arm64_min` keep ≥1 node per arch up so the
+  common case gets a free warm pod immediately. `*_max` caps the burst; NodePool
+  limits are sized to `*_max`.
+- **No-flap scale-down** — KEDA holds a pod ~20 min after it goes idle
+  (`stabilizationWindowSeconds: 1200`), then sheds at most `max(10 pods, 20%)`
+  per 20 min, so a follow-up build reuses the pod's warm decompressed NVMe layer
+  cache. Node churn is left to Karpenter.
+- **Safe scale-down** — `preStop` drain (waits until the pod's `:1234` is idle)
+  + long `terminationGracePeriodSeconds` + PDB, so a build is never killed
+  mid-flight. Scale-down removes an arbitrary pod, which may be mid-build; the
+  drain holds termination until that build finishes, but
+  `terminationGracePeriodSeconds` is a hard SIGKILL cap, so it must outlast the
+  longest possible build. It's set to **8100s (135m)**: **120m** (the max time a
+  docker build may run, matching HAProxy `timeout server`) **+ 15m** of
+  headroom for the drain's idle-detection polling. A build that starts just
+  before drain still completes; the cap only fires as a backstop if a pod never
+  drains.
+  The **PDB** (`maxUnavailable: 1` per arch) bounds *voluntary* disruptions —
+  node consolidation and manual `kubectl drain` — to one builder per arch at a
+  time, so those go through the preStop drain one pod at a time instead of
+  evicting several in-flight builds at once. (KEDA scale-down deletes pods
+  directly rather than via the eviction API, so it isn't PDB-gated — the drain +
+  grace cap above is what protects that path.)
+
+## Clients must retry the connect
+
+Build clients (both `docker buildx` and `buildctl`) use the `moby/buildkit` Go
+client, which dials with gRPC's default **~20s `MinConnectTimeout`** and
+**fail-fast** RPCs — there is no client-side flag to make it wait longer. During
+a burst, a build whose connection finds no free pod (`maxconn 1`) is dropped by
+the client after ~20s, well before KEDA/Karpenter can add a pod (minutes). An
+HAProxy-side `timeout queue` does **not** help: the client gives up at 20s
+regardless, so queueing on the LB is pointless (and was removed).
+
+So the **client must retry the build** on connection failures until a pod is
+free or the pool has scaled up; the repeated attempts also keep the autoscaler's
+load signal alive. PyTorch's `.ci/docker/build.sh` does this when
+`REMOTE_BUILDKIT` is set, and the workflow creates the remote builder *without*
+`--bootstrap` (the `docker buildx inspect --bootstrap` health check hits the same
+20s gate at setup). This was confirmed on the staging cluster.
+
+## HAProxy config changes roll the LB
+
+HAProxy renders its config only at container start, and nothing else restarts
+the `buildkitd-lb` pod, so a bare ConfigMap update (`maxconn`, timeouts,
+backends) would silently not take effect. `deploy.sh` stamps the LB pod template
+with a `checksum/config` annotation = a hash of `haproxy.yaml`; when the config
+changes the hash changes, which rolls the Deployment so the new pod picks up the
+new config. An unchanged config keeps the same hash, so routine deploys don't
+churn the LB. (The buildkitd worker pods do **not** yet have this, so a
+`buildkitd.toml` / `drain.sh` change needs a manual rollout to take effect.)
+
+Autoscaling requires the `keda` module to be deployed before `buildkit` (it
+provides the CRDs). The `monitoring` module scrapes the KEDA operator's metrics
+and ships buildkit-autoscaling alerts (scaler / fallback errors, queue backlog).
@@ -34,22 +34,43 @@ AMD64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_replicas "$REPLICAS")
 ARM64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_replicas "$REPLICAS")
 AMD64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_pods_per_node "$PODS_PER_NODE")
 ARM64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_pods_per_node "$PODS_PER_NODE")
+AUTOSCALING=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.enabled false)
 
 GENERATED_DIR="$MODULE_DIR/generated"
 
 # --- Generate manifests ---
 
 echo "Generating BuildKit manifests..."
-uv run "$MODULE_DIR/scripts/python/generate_buildkit.py" \
-  --arm64-instance-type "$ARM64_INSTANCE" \
-  --amd64-instance-type "$AMD64_INSTANCE" \
-  --replicas "$REPLICAS" \
-  --pods-per-node "$PODS_PER_NODE" \
-  --amd64-replicas "$AMD64_REPLICAS" \
-  --arm64-replicas "$ARM64_REPLICAS" \
-  --amd64-pods-per-node "$AMD64_PODS_PER_NODE" \
-  --arm64-pods-per-node "$ARM64_PODS_PER_NODE" \
+GEN_ARGS=(
+  --arm64-instance-type "$ARM64_INSTANCE"
+  --amd64-instance-type "$AMD64_INSTANCE"
+  --replicas "$REPLICAS"
+  --pods-per-node "$PODS_PER_NODE"
+  --amd64-replicas "$AMD64_REPLICAS"
+  --arm64-replicas "$ARM64_REPLICAS"
+  --amd64-pods-per-node "$AMD64_PODS_PER_NODE"
+  --arm64-pods-per-node "$ARM64_PODS_PER_NODE"
   --output-dir "$GENERATED_DIR"
+)
+if [[ "${AUTOSCALING,,}" == "true" ]]; then
+  AMD64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_min 2)
+  AMD64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_max 8)
+  ARM64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_min 4)
+  ARM64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_max 8)
+  # Fallback replicas if KEDA can't read the scale metric (0 = no fallback).
+  AMD64_FALLBACK=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_fallback 0)
+  ARM64_FALLBACK=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_fallback 0)
+  GEN_ARGS+=(
+    --autoscaling
+    --amd64-min "$AMD64_MIN"
+    --amd64-max "$AMD64_MAX"
+    --arm64-min "$ARM64_MIN"
+    --arm64-max "$ARM64_MAX"
+    --amd64-fallback "$AMD64_FALLBACK"
+    --arm64-fallback "$ARM64_FALLBACK"
+  )
+fi
+uv run "$MODULE_DIR/scripts/python/generate_buildkit.py" "${GEN_ARGS[@]}"
 
 # --- Apply NodePools (with cluster name substitution) ---
 
@@ -59,7 +80,15 @@ sed "s/CLUSTER_NAME_PLACEHOLDER/$CNAME/g" "$GENERATED_DIR/nodepools.yaml" | kube
 # --- Apply static k8s resources ---
 
 echo "Applying BuildKit static manifests..."
-kubectl_apply_if_changed -k "$MODULE_DIR/kubernetes/base/"
+# Stamp the buildkitd-lb pod template with a hash of haproxy.yaml: HAProxy reads
+# its config only at container start and nothing else restarts the LB, so the
+# changed hash is what rolls the Deployment on a ConfigMap change. Abort on an
+# empty hash (e.g. shasum missing): a constant stamp would never trigger a roll.
+HAPROXY_SUM=$(shasum -a 256 "$MODULE_DIR/kubernetes/base/haproxy.yaml" | cut -c1-12)
+[[ -n "$HAPROXY_SUM" ]] || { echo "ERROR: failed to hash haproxy.yaml" >&2; exit 1; }
+kubectl kustomize "$MODULE_DIR/kubernetes/base/" \
+  | sed "s/__HAPROXY_CFG_CHECKSUM__/$HAPROXY_SUM/" \
+  | kubectl_apply_if_changed -f -
 
 # --- Apply generated Deployments (only if changed) ---
 
@@ -93,5 +122,13 @@ else
   kubectl rollout status deployment/buildkitd-amd64 -n buildkit --timeout=15m
 fi
 
+# --- KEDA autoscaling (optional) ---
+# Scales on the in-cluster buildkit LB metrics; no external metrics backend.
+
+if [[ "${AUTOSCALING,,}" == "true" ]]; then
+  echo "Applying KEDA autoscaling manifests..."
+  kubectl_apply_if_changed -f "$GENERATED_DIR/autoscaling.yaml"
+fi
+
 echo "BuildKit deployed."
 kubectl get pods -n buildkit -o wide
@@ -0,0 +1,29 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: buildkitd-drain
+  namespace: buildkit
+data:
+  # preStop drain: block termination until no in-flight build remains. A build
+  # keeps an ESTABLISHED inbound connection on :1234 for its whole duration, so
+  # any such connection marks the pod busy and resets the idle counter. Two
+  # consecutive idle polls (15s apart) are required before returning, so one
+  # momentary gap with no connection is not taken for "done"; a short-lived
+  # connection such as a health check can only delay the exit, never hasten it.
+  # terminationGracePeriodSeconds caps the total wait.
+  # NOTE(review): netstat's stderr is discarded, so if netstat is missing or
+  # fails, awk sees no input and every poll counts as idle — confirm the
+  # buildkitd image ships netstat.
+  drain.sh: |
+    #!/bin/sh
+    idle=0
+    while :; do
+      if netstat -tn 2>/dev/null | awk '$NF=="ESTABLISHED" && $4 ~ /:1234$/{f=1} END{exit !f}'; then
+        idle=0
+      else
+        idle=$((idle + 1))
+        # Idleness confirmed: return now instead of sleeping one more interval.
+        [ "$idle" -ge 2 ] && break
+      fi
+      sleep 15
+    done
@@ -71,15 +71,15 @@ data:
     # resolution warnings.
     backend bk_arm64
       balance leastconn
-      server-template arm 10 buildkitd-arm64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
+      server-template arm 10 buildkitd-arm64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1
 
     backend bk_amd64
       balance leastconn
-      server-template amd 10 buildkitd-amd64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
+      server-template amd 10 buildkitd-amd64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1
 
     backend bk_all
       balance leastconn
-      server-template all 20 buildkitd-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
+      server-template all 20 buildkitd-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1
 
 ---
 apiVersion: apps/v1
@@ -100,6 +100,12 @@ spec:
     metadata:
       labels:
         app: buildkitd-lb
+      annotations:
+        # HAProxy renders its config once at container start, so a ConfigMap
+        # change has no effect until this pod restarts. deploy.sh fills this in
+        # with a hash of haproxy.yaml, so the Deployment rolls automatically
+        # whenever the config changes (nothing else restarts the LB).
+        checksum/config: "__HAPROXY_CFG_CHECKSUM__"
     spec:
       # Runs on base-infra nodes (CriticalAddonsOnly taint)
       tolerations:

@@ -5,6 +5,8 @@ kind: Kustomization
 resources:
   - namespace.yaml
   - configmap.yaml
+  - drain-configmap.yaml
   - haproxy.yaml
   - service.yaml
   - networkpolicy.yaml
+  - poddisruptionbudget.yaml
@@ -0,0 +1,23 @@
+# PodDisruptionBudgets for the buildkitd builders, one per architecture. Each
+# allows at most one matching pod to be unavailable through voluntary
+# disruptions (node consolidation, drains), so evictions go through the
+# preStop drain instead of killing builds.
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: buildkitd-amd64
+  namespace: buildkit
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels: {app: buildkitd, arch: amd64}
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: buildkitd-arm64
+  namespace: buildkit
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels: {app: buildkitd, arch: arm64}