diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml
index 9bc0dc34..214ac531 100644
--- a/osdc/clusters.yaml
+++ b/osdc/clusters.yaml
@@ -269,11 +269,17 @@ clusters:
       min_node_age_seconds: 900
     buildkit:
       amd64_instance_type: m6id.24xlarge
-      amd64_replicas: 32
       amd64_pods_per_node: 2
       arm64_instance_type: m7gd.16xlarge
-      arm64_replicas: 8
       arm64_pods_per_node: 4
+      autoscaling:
+        enabled: true
+        amd64_min: 32  # warm baseline = proven fixed pool: 32 pods on 16x m6id.24xlarge (2 pods/node)
+        amd64_max: 360  # 2x the ~90-day peak (≈180) for headroom
+        arm64_min: 8   # warm baseline = proven fixed pool: 8 pods on 2x m7gd.16xlarge (4 pods/node)
+        arm64_max: 30  # 2x the ~90-day peak (≈15) for headroom
+        amd64_fallback: 32  # if KEDA can't read metrics, hold the proven fixed pool
+        arm64_fallback: 8
     arc-runners:
       github_config_url: "https://github.com/pytorch"
       github_secret_name: pytorch-arc-cbr-production
@@ -299,6 +305,7 @@ clusters:
       - arc-runners
       - arc-runners-b200
       - arc-runners-h100
+      - keda
       - buildkit
       - pypi-cache
       - cache-enforcer
diff --git a/osdc/integration-tests/scripts/python/phases.py b/osdc/integration-tests/scripts/python/phases.py
index 27ddebf6..7a21f204 100644
--- a/osdc/integration-tests/scripts/python/phases.py
+++ b/osdc/integration-tests/scripts/python/phases.py
@@ -276,11 +276,12 @@ def prepare_pr(
     # Write integration test workflow
     (workflows_dir / "integration-test.yaml").write_text(workflow_content)
 
-    # Copy build-image reusable workflow
+    # Copy the reusable BuildKit workflow (connectivity + autoscaling scale jobs).
+    # The scale job builds an inline Dockerfile, so it needs no copied context.
     build_wf_src = upstream_dir / "integration-tests" / "workflows" / "build-image.yaml"
     (workflows_dir / "build-image.yaml").write_text(build_wf_src.read_text())
 
-    # Copy test Dockerfile
+    # Copy test Dockerfile (connectivity test context)
     docker_dir = canary_path / "docker" / "test-buildkit"
     docker_dir.mkdir(parents=True, exist_ok=True)
     dockerfile_src = upstream_dir / "integration-tests" / "docker" / "test-buildkit" / "Dockerfile"
diff --git a/osdc/integration-tests/scripts/python/test_run.py b/osdc/integration-tests/scripts/python/test_run.py
index ff3a7bcb..58e6ab48 100644
--- a/osdc/integration-tests/scripts/python/test_run.py
+++ b/osdc/integration-tests/scripts/python/test_run.py
@@ -112,7 +112,7 @@ def workflow_template(tmp_path):
     )
     (wf_dir / "integration-test.yaml.tpl").write_text(template)
 
-    # Also create build-image.yaml and Dockerfile for prepare_pr
+    # Also create the reusable workflow and Dockerfile that prepare_pr copies
     (wf_dir / "build-image.yaml").write_text("name: build-image\n")
     docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit"
     docker_dir.mkdir(parents=True)
diff --git a/osdc/integration-tests/workflows/build-image.yaml b/osdc/integration-tests/workflows/build-image.yaml
index b523d03d..df36cfc8 100644
--- a/osdc/integration-tests/workflows/build-image.yaml
+++ b/osdc/integration-tests/workflows/build-image.yaml
@@ -1,6 +1,7 @@
-# Reusable workflow: Build a test image via OSDC BuildKit
-# Called by integration-test.yaml to validate BuildKit connectivity.
-# Uses buildctl directly — no Docker daemon required.
+# Reusable workflow: exercise OSDC BuildKit for one arch.
+# Called by integration-test.yaml. Two jobs:
+#   build  — one buildctl build (validates connectivity via the buildctl client)
+#   scale  — burst of docker buildx builds (validates autoscaling via the prod client)
 name: Build Test Image
 
 on:
@@ -38,21 +39,95 @@ jobs:
 
       - name: Build test image via BuildKit
         run: |
+          set -eu
           echo "=== BuildKit ${{ inputs.arch }} connectivity test ==="
           ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
           echo "Connecting to: $ENDPOINT"
 
-          buildctl --addr "$ENDPOINT" build \
-            --frontend dockerfile.v0 \
-            --local context=docker/test-buildkit \
-            --local dockerfile=docker/test-buildkit \
-            --output type=image,name=ghcr.io/${{ github.repository }}:integration-test-${{ inputs.arch }}-${{ github.sha }},push=false
-
-          echo "PASS: BuildKit ${{ inputs.arch }} built successfully"
-          echo "Endpoint: $ENDPOINT"
+          # The buildkit client dials with gRPC's ~20s connect timeout, so a busy
+          # / cold pool drops the connection fast (no HAProxy queue holds it).
+          # This job plus the 8 `scale` builds over-subscribe a max-8 pool, so
+          # retry past a peer's ~10 min build: 45 x (≈5s fail + 15s) ≈ 15 min.
+          for attempt in $(seq 1 45); do
+            if buildctl --addr "$ENDPOINT" build \
+                --frontend dockerfile.v0 \
+                --local context=docker/test-buildkit \
+                --local dockerfile=docker/test-buildkit \
+                --output type=image,name=ghcr.io/${{ github.repository }}:integration-test-${{ inputs.arch }}-${{ github.sha }},push=false; then
+              echo "PASS: BuildKit ${{ inputs.arch }} built on attempt ${attempt} ($ENDPOINT)"
+              exit 0
+            fi
+            echo "attempt ${attempt}/45 failed (BuildKit cold/queued); retrying in 15s..."
+            sleep 15
+          done
+          echo "FAIL: BuildKit ${{ inputs.arch }} build failed after retries" >&2
+          exit 1
 
       - name: Verify BuildKit endpoint info
         run: |
           ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
           buildctl --addr "$ENDPOINT" debug info || echo "WARN: debug info not available"
           echo "PASS: BuildKit ${{ inputs.arch }} endpoint is responsive"
+
+  scale:
+    # 8 parallel docker buildx builds (the prod client), each holding a BuildKit
+    # slot (server maxconn=1) ~10 min via a sleep. The warm baseline is below the
+    # burst, so they finish within timeout-minutes only if KEDA scales the pool up;
+    # otherwise the back of the burst serializes and fails (retries or job timeout).
+    # NOTE(review): a build queued behind one ~10 min peer still passes (retry is
+    # ≈15 min), so this FAILS without autoscaling only if baseline < half the burst; confirm.
+    # Runs concurrently with `build`, so 9 builds contend for a max-8 pool: the
+    # odd one out has no pod until a peer's ~10 min build finishes, exercising
+    # the over-subscription wait (the retry below must outlast that).
+    runs-on: ${{ inputs.runner_label }}
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        replica: [1, 2, 3, 4, 5, 6, 7, 8]
+    container:
+      image: ghcr.io/actions/actions-runner:latest
+    steps:
+      - name: Set up Docker Buildx (remote, no bootstrap)
+        shell: bash
+        # NOT docker/setup-buildx-action: it runs `buildx inspect --bootstrap`,
+        # whose ~20s connect timeout fails at setup during a cold scale-up.
+        # `create` (no --bootstrap) just registers the builder; the build step
+        # retries to wait out scale-up.
+        run: |
+          set -ex
+          docker buildx create \
+            --name osdc-remote \
+            --driver remote \
+            --use \
+            "tcp://buildkitd-${{ inputs.arch }}.buildkit:1234"
+
+      - name: Occupy a BuildKit slot (~10 min) to drive autoscaling
+        shell: bash
+        run: |
+          set -eu
+          cat > Dockerfile.scale <<'EOF'
+          FROM alpine:3.21
+          ARG CACHEBUST
+          RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600
+          EOF
+          # buildx boots the builder with a hardcoded ~20s connect timeout (gRPC
+          # MinConnectTimeout), so retry to wait out cold scale-up and, when the
+          # pool is over-subscribed, a peer's ~10 min build; the repeated attempts
+          # also keep KEDA's load signal alive. ~45 x (≈5s fail + 15s) ≈ 15 min,
+          # within the 30-min job timeout (still fails if scale-up never happens).
+          for attempt in $(seq 1 45); do
+            if docker buildx build \
+                --platform "linux/${{ inputs.arch }}" \
+                --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \
+                --no-cache \
+                --output type=cacheonly \
+                -f Dockerfile.scale .; then
+              echo "build succeeded on attempt ${attempt}"
+              exit 0
+            fi
+            echo "attempt ${attempt}/45 failed (BuildKit cold/queued); retrying in 15s..."
+            sleep 15
+          done
+          echo "build failed after retries" >&2
+          exit 1
diff --git a/osdc/integration-tests/workflows/integration-test.yaml.tpl b/osdc/integration-tests/workflows/integration-test.yaml.tpl
index 4c20c327..63b60740 100644
--- a/osdc/integration-tests/workflows/integration-test.yaml.tpl
+++ b/osdc/integration-tests/workflows/integration-test.yaml.tpl
@@ -1436,13 +1436,15 @@ jobs:
   # END_B200
 
   # ── BuildKit Tests ────────────────────────────────────────────────────
-  build-amd64:
+  # Each call runs a buildctl connectivity build + an 8-wide docker buildx burst
+  # (fails if KEDA does not scale the pool up).
+  buildkit-amd64:
     uses: ./.github/workflows/build-image.yaml
     with:
       arch: amd64
       runner_label: {{PREFIX}}l-x86iamx-8-32
 
-  build-arm64:
+  buildkit-arm64:
     uses: ./.github/workflows/build-image.yaml
     with:
       arch: arm64
diff --git a/osdc/modules/buildkit/deploy.sh b/osdc/modules/buildkit/deploy.sh
index eb08b727..3f26c9b8 100755
--- a/osdc/modules/buildkit/deploy.sh
+++ b/osdc/modules/buildkit/deploy.sh
@@ -34,7 +34,8 @@ AMD64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_replicas "$REPLICAS")
 ARM64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_replicas "$REPLICAS")
 AMD64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_pods_per_node "$PODS_PER_NODE")
 ARM64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_pods_per_node "$PODS_PER_NODE")
-AUTOSCALING=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.enabled false)
+# Lowercase via tr: ${VAR,,} needs bash 4+, and deploy.sh also runs under macOS bash 3.2.
+AUTOSCALING=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.enabled false | tr '[:upper:]' '[:lower:]')
 
 GENERATED_DIR="$MODULE_DIR/generated"
 
@@ -52,7 +53,7 @@ GEN_ARGS=(
   --arm64-pods-per-node "$ARM64_PODS_PER_NODE"
   --output-dir "$GENERATED_DIR"
 )
-if [[ "${AUTOSCALING,,}" == "true" ]]; then
+if [[ "$AUTOSCALING" == "true" ]]; then
   AMD64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_min 2)
   AMD64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_max 8)
   ARM64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_min 4)
@@ -125,7 +126,7 @@ fi
 # --- KEDA autoscaling (optional) ---
 # Scales on the in-cluster buildkit LB metrics; no external metrics backend.
 
-if [[ "${AUTOSCALING,,}" == "true" ]]; then
+if [[ "$AUTOSCALING" == "true" ]]; then
   echo "Applying KEDA autoscaling manifests..."
   kubectl_apply_if_changed -f "$GENERATED_DIR/autoscaling.yaml"
 fi
diff --git a/osdc/modules/keda/deploy.sh b/osdc/modules/keda/deploy.sh
index 59f422ad..e74d3d9f 100755
--- a/osdc/modules/keda/deploy.sh
+++ b/osdc/modules/keda/deploy.sh
@@ -27,6 +27,7 @@ helm repo update kedacore >/dev/null 2>&1 || true
 helm_upgrade_if_changed keda "$NAMESPACE" \
   --create-namespace \
   --version "$CHART_VERSION" \
+  -f "$MODULE_DIR/helm/values.yaml" \
   --timeout 5m \
   --wait \
   kedacore/keda
diff --git a/osdc/modules/keda/helm/values.yaml b/osdc/modules/keda/helm/values.yaml
new file mode 100644
index 00000000..282ca179
--- /dev/null
+++ b/osdc/modules/keda/helm/values.yaml
@@ -0,0 +1,13 @@
+# Schedule on the base-infra nodes (tainted CriticalAddonsOnly); every other node
+# is reserved by workload taints, so without this the keda pods stay Pending and
+# the install's --wait times out. Applies to operator, metrics server, webhooks.
+tolerations:
+  - key: CriticalAddonsOnly
+    operator: Exists
+
+# Expose the KEDA operator's Prometheus metrics (keda_scaler_* /
+# keda_scaledobject_*) on :8080. The ServiceMonitor that scrapes it lives in the
+# monitoring module, so it applies after the monitoring.coreos.com CRDs exist.
+prometheus:
+  operator:
+    enabled: true
diff --git a/osdc/modules/monitoring/kubernetes/monitors/kustomization.yaml b/osdc/modules/monitoring/kubernetes/monitors/kustomization.yaml
index 2d223760..08e1aa28 100644
--- a/osdc/modules/monitoring/kubernetes/monitors/kustomization.yaml
+++ b/osdc/modules/monitoring/kubernetes/monitors/kustomization.yaml
@@ -12,6 +12,7 @@ resources:
   - servicemonitors/buildkit-haproxy.yaml
   - servicemonitors/harbor.yaml
   - servicemonitors/karpenter.yaml
+  - servicemonitors/keda.yaml
   - servicemonitors/node-compactor.yaml
   - servicemonitors/pushgateway.yaml
   - servicemonitors/pypi-cache.yaml
diff --git a/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/keda.yaml b/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/keda.yaml
new file mode 100644
index 00000000..7200fc48
--- /dev/null
+++ b/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/keda.yaml
@@ -0,0 +1,28 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: keda
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/part-of: osdc-monitoring
+spec:
+  namespaceSelector:
+    matchNames:
+      - keda
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: keda-operator
+  endpoints:
+    - port: metrics
+      path: /metrics
+      interval: 60s
+      metricRelabelings:
+        # Keep only KEDA's own keda_* metrics (scaler/scaledobject values, errors,
+        # activity, latency); go_/process_/controller-runtime series are dropped.
+        # Then drop histogram buckets, leaving only each histogram's _sum/_count.
+        - action: keep
+          sourceLabels: [__name__]
+          regex: "keda_.*"
+        - action: drop
+          sourceLabels: [__name__]
+          regex: ".*_bucket"
diff --git a/osdc/modules/monitoring/tests/smoke/test_monitoring.py b/osdc/modules/monitoring/tests/smoke/test_monitoring.py
index c095df99..17123ffd 100644
--- a/osdc/modules/monitoring/tests/smoke/test_monitoring.py
+++ b/osdc/modules/monitoring/tests/smoke/test_monitoring.py
@@ -29,6 +29,7 @@
     "buildkit-haproxy",
     "harbor",
     "karpenter",
+    "keda",
     "node-compactor",
     "pypi-cache",
     "dcgm-exporter",
@@ -289,6 +290,7 @@ def test_metrics_arriving(self, resolve_config) -> None:
     "buildkit": ("buildkitd-pods", "buildkit"),
     "buildkit-haproxy": ("buildkitd-lb-metrics", "buildkit"),
     "karpenter": ("karpenter", "karpenter"),
+    "keda": ("keda-operator", "keda"),
     "node-compactor": ("node-compactor", None),
     # arc-controller: skipped — ARC controller metrics Service varies by chart version
     # harbor: skipped — Harbor exporter Service name varies by chart version