pytorch · huydhn · Jun 5, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
@@ -83,6 +83,10 @@ defaults:
     cpu_disruption_budget: "20%"
   buildkit:
     replicas_per_arch: 12
+    autoscaling:
+      enabled: false
+  keda:
+    chart_version: "2.16.1"
   monitoring:
     grafana_cloud_url: "https://prometheus-prod-36-prod-us-west-0.grafana.net/api/prom/push"
     grafana_cloud_read_url: "https://prometheus-prod-36-prod-us-west-0.grafana.net/api/prom"
@@ -162,15 +166,22 @@ clusters:
       github_secret_name: pytorch-arc-staging
       runner_name_prefix: "c-mt-"
     buildkit:
-      replicas_per_arch: 2
       arm64_instance_type: m7gd.16xlarge
+      arm64_pods_per_node: 4
+      autoscaling:
+        enabled: true
+        amd64_min: 2  # 1x m6id.24xlarge (2 pods/node)
+        amd64_max: 8
+        arm64_min: 4  # 1x m7gd.16xlarge (4 pods/node)
+        arm64_max: 8
     pypi_cache:
       replicas: 1
     modules:
       - karpenter
       - arc
       - nodepools
       - arc-runners
+      - keda
       - buildkit
       - pypi-cache
       - cache-enforcer
@@ -283,11 +294,15 @@ clusters:
       min_node_age_seconds: 900
     buildkit:
       amd64_instance_type: m6id.24xlarge
-      amd64_replicas: 32
       amd64_pods_per_node: 2
       arm64_instance_type: m7gd.16xlarge
-      arm64_replicas: 8
       arm64_pods_per_node: 4
+      autoscaling:
+        enabled: true
+        amd64_min: 2   # 1x m6id.24xlarge (2 pods/node)
+        amd64_max: 128  # 14d peak 105; headroom to spare above it
+        arm64_min: 4   # 1x m7gd.16xlarge (4 pods/node)
+        arm64_max: 16  # 14d peak 8, likely capped by fixed pool; headroom
     arc-runners:
       github_config_url: "https://github.com/pytorch"
       github_secret_name: pytorch-arc-cbr-production

@@ -0,0 +1,9 @@
+# Scale-test image: the build itself is the workload. The RUN step sleeps for
+# 600 s, holding one BuildKit slot (~10 min) so a burst of parallel builds
+# exceeds the warm baseline and forces KEDA scale-up.
+FROM alpine:3.21
+# Referenced by RUN below; it differs per build so each build actually runs
+# (no layer reuse).
+ARG CACHEBUST
+RUN echo "osdc buildkit scale-test ${CACHEBUST}" \
+    && sleep 600
@@ -276,15 +276,17 @@ def prepare_pr(
     # Write integration test workflow
     (workflows_dir / "integration-test.yaml").write_text(workflow_content)
 
-    # Copy build-image reusable workflow
-    build_wf_src = upstream_dir / "integration-tests" / "workflows" / "build-image.yaml"
-    (workflows_dir / "build-image.yaml").write_text(build_wf_src.read_text())
-
-    # Copy test Dockerfile
-    docker_dir = canary_path / "docker" / "test-buildkit"
-    docker_dir.mkdir(parents=True, exist_ok=True)
-    dockerfile_src = upstream_dir / "integration-tests" / "docker" / "test-buildkit" / "Dockerfile"
-    (docker_dir / "Dockerfile").write_text(dockerfile_src.read_text())
+    # Copy reusable BuildKit workflows
+    wf_root = upstream_dir / "integration-tests" / "workflows"
+    for wf in ("build-image.yaml", "build-image-scale.yaml"):
+        (workflows_dir / wf).write_text((wf_root / wf).read_text())
+
+    # Copy test Dockerfiles (connectivity + autoscaling scale test)
+    docker_root = upstream_dir / "integration-tests" / "docker"
+    for name in ("test-buildkit", "test-buildkit-scale"):
+        dst = canary_path / "docker" / name
+        dst.mkdir(parents=True, exist_ok=True)
+        (dst / "Dockerfile").write_text((docker_root / name / "Dockerfile").read_text())
 
     # Commit
     run_cmd(["git", "add", "-A"], cwd=canary_path)

@@ -112,11 +112,13 @@ def workflow_template(tmp_path):
     )
     (wf_dir / "integration-test.yaml.tpl").write_text(template)
 
-    # Also create build-image.yaml and Dockerfile for prepare_pr
+    # Also create reusable workflows and Dockerfiles for prepare_pr
     (wf_dir / "build-image.yaml").write_text("name: build-image\n")
-    docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit"
-    docker_dir.mkdir(parents=True)
-    (docker_dir / "Dockerfile").write_text("FROM alpine\n")
+    (wf_dir / "build-image-scale.yaml").write_text("name: build-image-scale\n")
+    for name in ("test-buildkit", "test-buildkit-scale"):
+        docker_dir = upstream / "integration-tests" / "docker" / name
+        docker_dir.mkdir(parents=True)
+        (docker_dir / "Dockerfile").write_text("FROM alpine\n")
 
     return upstream
 

@@ -0,0 +1,66 @@
+# Reusable workflow: BuildKit autoscaling scale test.
+# Launches a burst of 8 parallel builds against one arch's remote BuildKit, each
+# holding a slot (server maxconn=1) for ~10 min via a sleep. The warm baseline
+# (amd64_min / arm64_min) is below the burst, so the builds finish within
+# timeout-minutes only if KEDA scales the pool up. Without scale-up they
+# serialize through the baseline pods and the back of the queue times out — i.e.
+# this job FAILS when autoscaling does not happen.
+name: BuildKit Scale Test
+
+on:
+  workflow_call:
+    inputs:
+      arch:
+        description: "Target architecture (amd64 or arm64)"
+        required: true
+        type: string
+      runner_label:
+        description: "Runner label to use (includes cluster prefix)"
+        required: false
+        type: string
+        default: "l-x86iavx512-2-4"
+
+jobs:
+  scale:
+    # Runs on x86 — BuildKit is a *remote* builder; arch selects the endpoint.
+    # timeout-minutes is the gate: scaled-up ~18 min, serialized ~43 min.
+    runs-on: ${{ inputs.runner_label }}
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        replica: [1, 2, 3, 4, 5, 6, 7, 8]
+    container:
+      image: ghcr.io/actions/actions-runner:latest
+    steps:
+      - name: Install buildctl
+        run: |
+          BUILDKIT_VERSION="v0.29.0"
+          mkdir -p "$HOME/.local/bin"
+          # -f: fail on an HTTP error instead of piping an error page into tar.
+          curl -fsSL "https://github.com/moby/buildkit/releases/download/${BUILDKIT_VERSION}/buildkit-${BUILDKIT_VERSION}.linux-amd64.tar.gz" \
+            | tar xz --strip-components=1 -C "$HOME/.local/bin" bin/buildctl
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+          "$HOME/.local/bin/buildctl" --version
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Occupy a BuildKit slot (~10 min) to drive autoscaling
+        # Expression values are handed over through env instead of being
+        # interpolated into the script, so they are never parsed as shell syntax.
+        env:
+          ARCH: ${{ inputs.arch }}
+          REPLICA: ${{ matrix.replica }}
+          RUN_ID: ${{ github.run_id }}
+        run: |
+          ENDPOINT="tcp://buildkitd-${ARCH}.buildkit:1234"
+          echo "Build ${REPLICA} -> $ENDPOINT"
+          buildctl --addr "$ENDPOINT" build \
+            --frontend dockerfile.v0 \
+            --local context=docker/test-buildkit-scale \
+            --local dockerfile=docker/test-buildkit-scale \
+            --opt "build-arg:CACHEBUST=${ARCH}-${REPLICA}-${RUN_ID}" \
+            --no-cache \
+            --output type=cacheonly
+          echo "PASS: build ${REPLICA} finished within timeout"
@@ -1514,6 +1514,20 @@ jobs:
       arch: arm64
       runner_label: {{PREFIX}}l-x86iamx-8-32
 
+  # ── BuildKit Autoscaling Scale Test ───────────────────────────────────
+  # Bursts 8 parallel builds per arch; fails if KEDA does not scale the pool up.
+  buildkit-scale-amd64:
+    uses: ./.github/workflows/build-image-scale.yaml
+    with:
+      arch: amd64
+      runner_label: {{PREFIX}}l-x86iamx-8-32
+
+  buildkit-scale-arm64:
+    uses: ./.github/workflows/build-image-scale.yaml
+    with:
+      arch: arm64
+      runner_label: {{PREFIX}}l-x86iamx-8-32
+
   # ── Harbor Cache Test ─────────────────────────────────────────────────
   test-harbor:
     runs-on: {{PREFIX}}l-x86iamx-8-32

@@ -0,0 +1,40 @@
+# BuildKit module
+
+Remote BuildKit build service: per-arch `buildkitd` Deployments behind an HAProxy
+LB, on dedicated Karpenter NodePools. Clients build with
+`buildctl --addr tcp://buildkitd-<arch>.buildkit:1234`.
+
+Sizing is per-arch in `clusters.yaml` (`buildkit.{amd64,arm64}_{replicas,pods_per_node}`,
+`*_instance_type`); pod CPU/mem is computed by `scripts/python/generate_buildkit.py`.
+
+## Autoscaling (optional, `buildkit.autoscaling.enabled`)
+
+Absorb bursts of concurrent builds without overloading existing pods, and scale
+back to a small warm baseline when idle.
+
+- **One build per pod** — HAProxy `server maxconn 1` (matches buildkitd
+  `max-parallelism = 1`). Excess builds **queue** in HAProxy instead of stacking
+  on a busy pod; as new pods register (DNS), queued builds flow onto them, so
+  scaled-up pods never sit idle. `timeout queue` must stay set and large enough
+  to outlast a node-provision cycle — if omitted it falls back to `timeout
+  connect` (5s), which would abort queued builds before pods scale up.
+- **In-cluster scale signal** — KEDA `ScaledObject` per arch, `metrics-api`
+  scraping the LB's own metrics (`haproxy_backend_current_sessions`) — no external
+  metrics backend.
+- **Warm baseline** — `amd64_min` / `arm64_min` keep ≥1 node per arch up so the
+  common case gets a free warm pod immediately. `*_max` caps the burst; NodePool
+  limits are sized to `*_max`.
+- **Safe scale-down** — `preStop` drain (waits until the pod's `:1234` is idle)
+  + long `terminationGracePeriodSeconds` + PDB, so a build is never killed
+  mid-flight.
+- **Node consolidation** — the NodePool uses Karpenter `consolidationPolicy:
+  WhenEmpty`: a node is reclaimed only once it has no buildkitd pod, never by
+  evicting a running build to bin-pack. So after a burst, scattered survivor
+  pods can leave nodes half-full (more nodes than the cold baseline) until they
+  drain naturally — a deliberate trade of some idle node cost for zero build
+  disruption.
+
+Build clients should retry the connection attempt, so a build can wait for a pod
+when the pool is cold or already has builds queued.
+
+Requires the `keda` module deployed before `buildkit` (provides the CRDs).
@@ -34,22 +34,38 @@ AMD64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_replicas "$REPLICAS")
 ARM64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_replicas "$REPLICAS")
 AMD64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_pods_per_node "$PODS_PER_NODE")
 ARM64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_pods_per_node "$PODS_PER_NODE")
+AUTOSCALING=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.enabled false | tr '[:upper:]' '[:lower:]')
 
 GENERATED_DIR="$MODULE_DIR/generated"
 
 # --- Generate manifests ---
 
 echo "Generating BuildKit manifests..."
-uv run "$MODULE_DIR/scripts/python/generate_buildkit.py" \
-  --arm64-instance-type "$ARM64_INSTANCE" \
-  --amd64-instance-type "$AMD64_INSTANCE" \
-  --replicas "$REPLICAS" \
-  --pods-per-node "$PODS_PER_NODE" \
-  --amd64-replicas "$AMD64_REPLICAS" \
-  --arm64-replicas "$ARM64_REPLICAS" \
-  --amd64-pods-per-node "$AMD64_PODS_PER_NODE" \
-  --arm64-pods-per-node "$ARM64_PODS_PER_NODE" \
+GEN_ARGS=(
+  --arm64-instance-type "$ARM64_INSTANCE"
+  --amd64-instance-type "$AMD64_INSTANCE"
+  --replicas "$REPLICAS"
+  --pods-per-node "$PODS_PER_NODE"
+  --amd64-replicas "$AMD64_REPLICAS"
+  --arm64-replicas "$ARM64_REPLICAS"
+  --amd64-pods-per-node "$AMD64_PODS_PER_NODE"
+  --arm64-pods-per-node "$ARM64_PODS_PER_NODE"
   --output-dir "$GENERATED_DIR"
+)
+if [[ "$AUTOSCALING" == "true" ]]; then
+  AMD64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_min 2)
+  AMD64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_max 8)
+  ARM64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_min 4)
+  ARM64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_max 8)
+  GEN_ARGS+=(
+    --autoscaling
+    --amd64-min "$AMD64_MIN"
+    --amd64-max "$AMD64_MAX"
+    --arm64-min "$ARM64_MIN"
+    --arm64-max "$ARM64_MAX"
+  )
+fi
+uv run "$MODULE_DIR/scripts/python/generate_buildkit.py" "${GEN_ARGS[@]}"
 
 # --- Apply NodePools (with cluster name substitution) ---
 
@@ -93,5 +109,13 @@ else
   kubectl rollout status deployment/buildkitd-amd64 -n buildkit --timeout=15m
 fi
 
+# --- KEDA autoscaling (optional) ---
+# Scales on the in-cluster buildkit LB metrics; no external metrics backend.
+
+if [[ "$AUTOSCALING" == "true" ]]; then
+  echo "Applying KEDA autoscaling manifests..."
+  kubectl_apply_if_changed -f "$GENERATED_DIR/autoscaling.yaml"
+fi
+
 echo "BuildKit deployed."
 kubectl get pods -n buildkit -o wide
@@ -0,0 +1,29 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: buildkitd-drain
+  namespace: buildkit
+data:
+  # preStop drain: block termination until no in-flight build remains. A build
+  # keeps an ESTABLISHED inbound connection on :1234 for its whole duration;
+  # the script exits only after two consecutive polls (15s apart) see no such
+  # connection, so a single quiet poll is not taken as "done". If the
+  # connection table cannot be read, the poll counts as busy rather than idle,
+  # so a broken probe never releases a pod early.
+  # terminationGracePeriodSeconds caps the total wait.
+  drain.sh: |
+    #!/bin/sh
+    idle=0
+    while :; do
+      # Capture netstat's output first so its failure is visible; in a
+      # pipeline only awk's exit status would be reported.
+      if conns=$(netstat -tn 2>/dev/null) &&
+         ! printf '%s\n' "$conns" | awk '$NF=="ESTABLISHED" && $4 ~ /:1234$/{f=1} END{exit !f}'; then
+        idle=$((idle + 1))
+      else
+        idle=0
+      fi
+      # Leave right after the second idle poll instead of sleeping once more.
+      [ "$idle" -ge 2 ] && break
+      sleep 15
+    done
@@ -24,6 +24,9 @@ data:
       timeout connect 5s
       timeout client 120m
       timeout server 120m
+      # Queue builds (server maxconn=1) while pods scale up. Keep <= 120m, the max
+      # time a docker build is allowed to run.
+      timeout queue 60m
       log global
       option tcplog
 
@@ -71,15 +74,18 @@ data:
     # resolution warnings.
+    # NOTE(review): each server-template below pre-allocates a fixed number of
+    # server slots (10/10/20). With maxconn 1 per server, that number presumably
+    # also bounds concurrent builds per backend; confirm it covers *_max.
     backend bk_arm64
       balance leastconn
-      server-template arm 10 buildkitd-arm64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
+      server-template arm 10 buildkitd-arm64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1
 
     backend bk_amd64
       balance leastconn
-      server-template amd 10 buildkitd-amd64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
+      server-template amd 10 buildkitd-amd64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1
 
     backend bk_all
       balance leastconn
-      server-template all 20 buildkitd-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
+      server-template all 20 buildkitd-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1
 
 ---
 apiVersion: apps/v1

@@ -5,6 +5,8 @@ kind: Kustomization
 resources:
   - namespace.yaml
   - configmap.yaml
+  - drain-configmap.yaml
   - haproxy.yaml
   - service.yaml
   - networkpolicy.yaml
+  - poddisruptionbudget.yaml
@@ -0,0 +1,26 @@
+# One PodDisruptionBudget per architecture. Each allows at most one buildkitd
+# pod of that arch to be voluntarily disrupted (node consolidation, drains) at
+# a time, so evictions go through the preStop drain instead of killing builds.
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  namespace: buildkit
+  name: buildkitd-arm64
+spec:
+  selector:
+    matchLabels:
+      arch: arm64
+      app: buildkitd
+  maxUnavailable: 1
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  namespace: buildkit
+  name: buildkitd-amd64
+spec:
+  selector:
+    matchLabels:
+      arch: amd64
+      app: buildkitd
+  maxUnavailable: 1