From fbbedbecb2fcde9ee25af9c6e262465ec64905b1 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 01:52:22 -0700 Subject: [PATCH 01/14] Update [ghstack-poisoned] --- .../docker/test-buildkit-scale/Dockerfile | 6 ++ .../scripts/python/phases.py | 20 ++++--- .../scripts/python/test_run.py | 10 ++-- .../workflows/build-image-scale.yaml | 59 +++++++++++++++++++ .../workflows/integration-test.yaml.tpl | 14 +++++ 5 files changed, 96 insertions(+), 13 deletions(-) create mode 100644 osdc/integration-tests/docker/test-buildkit-scale/Dockerfile create mode 100644 osdc/integration-tests/workflows/build-image-scale.yaml diff --git a/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile b/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile new file mode 100644 index 00000000..e317eb61 --- /dev/null +++ b/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile @@ -0,0 +1,6 @@ +# Holds one BuildKit slot (~10 min) so a burst of parallel builds exceeds the +# warm baseline and forces KEDA scale-up. CACHEBUST differs per build so each +# actually runs (no layer reuse). 
+FROM alpine:3.21 +ARG CACHEBUST +RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600 diff --git a/osdc/integration-tests/scripts/python/phases.py b/osdc/integration-tests/scripts/python/phases.py index 27ddebf6..17654000 100644 --- a/osdc/integration-tests/scripts/python/phases.py +++ b/osdc/integration-tests/scripts/python/phases.py @@ -276,15 +276,17 @@ def prepare_pr( # Write integration test workflow (workflows_dir / "integration-test.yaml").write_text(workflow_content) - # Copy build-image reusable workflow - build_wf_src = upstream_dir / "integration-tests" / "workflows" / "build-image.yaml" - (workflows_dir / "build-image.yaml").write_text(build_wf_src.read_text()) - - # Copy test Dockerfile - docker_dir = canary_path / "docker" / "test-buildkit" - docker_dir.mkdir(parents=True, exist_ok=True) - dockerfile_src = upstream_dir / "integration-tests" / "docker" / "test-buildkit" / "Dockerfile" - (docker_dir / "Dockerfile").write_text(dockerfile_src.read_text()) + # Copy reusable BuildKit workflows + wf_root = upstream_dir / "integration-tests" / "workflows" + for wf in ("build-image.yaml", "build-image-scale.yaml"): + (workflows_dir / wf).write_text((wf_root / wf).read_text()) + + # Copy test Dockerfiles (connectivity + autoscaling scale test) + docker_root = upstream_dir / "integration-tests" / "docker" + for name in ("test-buildkit", "test-buildkit-scale"): + dst = canary_path / "docker" / name + dst.mkdir(parents=True, exist_ok=True) + (dst / "Dockerfile").write_text((docker_root / name / "Dockerfile").read_text()) # Commit run_cmd(["git", "add", "-A"], cwd=canary_path) diff --git a/osdc/integration-tests/scripts/python/test_run.py b/osdc/integration-tests/scripts/python/test_run.py index ff3a7bcb..dc4d5188 100644 --- a/osdc/integration-tests/scripts/python/test_run.py +++ b/osdc/integration-tests/scripts/python/test_run.py @@ -112,11 +112,13 @@ def workflow_template(tmp_path): ) (wf_dir / "integration-test.yaml.tpl").write_text(template) - # Also 
create build-image.yaml and Dockerfile for prepare_pr + # Also create reusable workflows and Dockerfiles for prepare_pr (wf_dir / "build-image.yaml").write_text("name: build-image\n") - docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit" - docker_dir.mkdir(parents=True) - (docker_dir / "Dockerfile").write_text("FROM alpine\n") + (wf_dir / "build-image-scale.yaml").write_text("name: build-image-scale\n") + for name in ("test-buildkit", "test-buildkit-scale"): + docker_dir = upstream / "integration-tests" / "docker" / name + docker_dir.mkdir(parents=True) + (docker_dir / "Dockerfile").write_text("FROM alpine\n") return upstream diff --git a/osdc/integration-tests/workflows/build-image-scale.yaml b/osdc/integration-tests/workflows/build-image-scale.yaml new file mode 100644 index 00000000..b485dc5a --- /dev/null +++ b/osdc/integration-tests/workflows/build-image-scale.yaml @@ -0,0 +1,59 @@ +# Reusable workflow: BuildKit autoscaling scale test. +# Launches a burst of 8 parallel builds against one arch's remote BuildKit, each +# holding a slot (server maxconn=1) for ~10 min via a sleep. The warm baseline +# (amd64_min / arm64_min) is below the burst, so the builds finish within +# timeout-minutes only if KEDA scales the pool up. Without scale-up they +# serialize through the baseline pods and the back of the queue times out — i.e. +# this job FAILS when autoscaling does not happen. +name: BuildKit Scale Test + +on: + workflow_call: + inputs: + arch: + description: "Target architecture (amd64 or arm64)" + required: true + type: string + runner_label: + description: "Runner label to use (includes cluster prefix)" + required: false + type: string + default: "l-x86iavx512-2-4" + +jobs: + scale: + # Runs on x86 — BuildKit is a *remote* builder; arch selects the endpoint. + # timeout-minutes is the gate: scaled-up ~18 min, serialized ~43 min. 
+ runs-on: ${{ inputs.runner_label }} + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + replica: [1, 2, 3, 4, 5, 6, 7, 8] + container: + image: ghcr.io/actions/actions-runner:latest + steps: + - name: Install buildctl + run: | + BUILDKIT_VERSION="v0.29.0" + mkdir -p "$HOME/.local/bin" + curl -sSL "https://github.com/moby/buildkit/releases/download/${BUILDKIT_VERSION}/buildkit-${BUILDKIT_VERSION}.linux-amd64.tar.gz" \ + | tar xz --strip-components=1 -C "$HOME/.local/bin" bin/buildctl + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + "$HOME/.local/bin/buildctl" --version + + - name: Checkout + uses: actions/checkout@v4 + + - name: Occupy a BuildKit slot (~10 min) to drive autoscaling + run: | + ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" + echo "Build ${{ matrix.replica }} -> $ENDPOINT" + buildctl --addr "$ENDPOINT" build \ + --frontend dockerfile.v0 \ + --local context=docker/test-buildkit-scale \ + --local dockerfile=docker/test-buildkit-scale \ + --opt build-arg:CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }} \ + --no-cache \ + --output type=cacheonly + echo "PASS: build ${{ matrix.replica }} finished within timeout" diff --git a/osdc/integration-tests/workflows/integration-test.yaml.tpl b/osdc/integration-tests/workflows/integration-test.yaml.tpl index c9563480..7d4fb3aa 100644 --- a/osdc/integration-tests/workflows/integration-test.yaml.tpl +++ b/osdc/integration-tests/workflows/integration-test.yaml.tpl @@ -1514,6 +1514,20 @@ jobs: arch: arm64 runner_label: {{PREFIX}}l-x86iamx-8-32 + # ── BuildKit Autoscaling Scale Test ─────────────────────────────────── + # Bursts 8 parallel builds per arch; fails if KEDA does not scale the pool up. 
+ buildkit-scale-amd64: + uses: ./.github/workflows/build-image-scale.yaml + with: + arch: amd64 + runner_label: {{PREFIX}}l-x86iamx-8-32 + + buildkit-scale-arm64: + uses: ./.github/workflows/build-image-scale.yaml + with: + arch: arm64 + runner_label: {{PREFIX}}l-x86iamx-8-32 + # ── Harbor Cache Test ───────────────────────────────────────────────── test-harbor: runs-on: {{PREFIX}}l-x86iamx-8-32 From 4f58d648123935f125e406cb5549e247c26811af Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 01:52:22 -0700 Subject: [PATCH 02/14] Update (base update) [ghstack-poisoned] --- osdc/clusters.yaml | 21 ++- osdc/modules/buildkit/README.md | 32 ++++ osdc/modules/buildkit/deploy.sh | 42 ++++- .../kubernetes/base/drain-configmap.yaml | 21 +++ .../buildkit/kubernetes/base/haproxy.yaml | 10 +- .../kubernetes/base/kustomization.yaml | 2 + .../kubernetes/base/poddisruptionbudget.yaml | 25 +++ .../scripts/python/generate_buildkit.py | 157 ++++++++++++++++-- .../scripts/python/test_generate_buildkit.py | 118 +++++++++++++ osdc/modules/keda/deploy.sh | 34 ++++ 10 files changed, 430 insertions(+), 32 deletions(-) create mode 100644 osdc/modules/buildkit/README.md create mode 100644 osdc/modules/buildkit/kubernetes/base/drain-configmap.yaml create mode 100644 osdc/modules/buildkit/kubernetes/base/poddisruptionbudget.yaml create mode 100755 osdc/modules/keda/deploy.sh diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml index 44aa9a75..f6e15a08 100644 --- a/osdc/clusters.yaml +++ b/osdc/clusters.yaml @@ -83,6 +83,10 @@ defaults: cpu_disruption_budget: "20%" buildkit: replicas_per_arch: 12 + autoscaling: + enabled: false + keda: + chart_version: "2.16.1" monitoring: grafana_cloud_url: "https://prometheus-prod-36-prod-us-west-0.grafana.net/api/prom/push" grafana_cloud_read_url: "https://prometheus-prod-36-prod-us-west-0.grafana.net/api/prom" @@ -162,8 +166,14 @@ clusters: github_secret_name: pytorch-arc-staging runner_name_prefix: "c-mt-" buildkit: - replicas_per_arch: 2 
arm64_instance_type: m7gd.16xlarge + arm64_pods_per_node: 4 + autoscaling: + enabled: true + amd64_min: 2 # 1x m6id.24xlarge (2 pods/node) + amd64_max: 8 + arm64_min: 4 # 1x m7gd.16xlarge (4 pods/node) + arm64_max: 8 pypi_cache: replicas: 1 modules: @@ -171,6 +181,7 @@ clusters: - arc - nodepools - arc-runners + - keda - buildkit - pypi-cache - cache-enforcer @@ -283,11 +294,15 @@ clusters: min_node_age_seconds: 900 buildkit: amd64_instance_type: m6id.24xlarge - amd64_replicas: 32 amd64_pods_per_node: 2 arm64_instance_type: m7gd.16xlarge - arm64_replicas: 8 arm64_pods_per_node: 4 + autoscaling: + enabled: true + amd64_min: 2 # 1x m6id.24xlarge (2 pods/node) + amd64_max: 128 # 14d peak 105; headroom to spare above it + arm64_min: 4 # 1x m7gd.16xlarge (4 pods/node) + arm64_max: 16 # 14d peak 8, likely capped by fixed pool; headroom arc-runners: github_config_url: "https://github.com/pytorch" github_secret_name: pytorch-arc-cbr-production diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md new file mode 100644 index 00000000..e17fafdf --- /dev/null +++ b/osdc/modules/buildkit/README.md @@ -0,0 +1,32 @@ +# BuildKit module + +Remote BuildKit build service: per-arch `buildkitd` Deployments behind an HAProxy +LB, on dedicated Karpenter NodePools. Clients build with +`buildctl --addr tcp://buildkitd-{amd64,arm64}.buildkit:1234`. + +Sizing is per-arch in `clusters.yaml` (`buildkit.{amd64,arm64}_{replicas,pods_per_node}`, +`*_instance_type`); pod CPU/mem is computed by `scripts/python/generate_buildkit.py`. + +## Autoscaling (optional, `buildkit.autoscaling.enabled`) + +Absorb bursts of concurrent builds without overloading existing pods, and scale +back to a small warm baseline when idle. + +- **One build per pod** — HAProxy `server maxconn 1` (matches buildkitd + `max-parallelism = 1`). 
Excess builds **queue** in HAProxy (`timeout queue`) + instead of stacking on a busy pod; as new pods register (DNS), queued builds + flow onto them, so scaled-up pods never sit idle. +- **In-cluster scale signal** — KEDA `ScaledObject` per arch, `metrics-api` + scraping the LB's own metrics (`haproxy_backend_current_sessions`) — no external + metrics backend. +- **Warm baseline** — `amd64_min` / `arm64_min` keep ≥1 node per arch up so the + common case gets a free warm pod immediately. `*_max` caps the burst; NodePool + limits are sized to `*_max`. +- **Safe scale-down** — `preStop` drain (waits until the pod's `:1234` is idle) + + long `terminationGracePeriodSeconds` + PDB, so a build is never killed + mid-flight. + +Build clients should retry the connect so a build can wait for a pod from a cold +or queued pool. + +Requires the `keda` module deployed before `buildkit` (provides the CRDs). diff --git a/osdc/modules/buildkit/deploy.sh b/osdc/modules/buildkit/deploy.sh index c7be6d39..ece804ad 100755 --- a/osdc/modules/buildkit/deploy.sh +++ b/osdc/modules/buildkit/deploy.sh @@ -34,22 +34,38 @@ AMD64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_replicas "$REPLICAS") ARM64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_replicas "$REPLICAS") AMD64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_pods_per_node "$PODS_PER_NODE") ARM64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_pods_per_node "$PODS_PER_NODE") +AUTOSCALING=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.enabled false) GENERATED_DIR="$MODULE_DIR/generated" # --- Generate manifests --- echo "Generating BuildKit manifests..." 
-uv run "$MODULE_DIR/scripts/python/generate_buildkit.py" \ - --arm64-instance-type "$ARM64_INSTANCE" \ - --amd64-instance-type "$AMD64_INSTANCE" \ - --replicas "$REPLICAS" \ - --pods-per-node "$PODS_PER_NODE" \ - --amd64-replicas "$AMD64_REPLICAS" \ - --arm64-replicas "$ARM64_REPLICAS" \ - --amd64-pods-per-node "$AMD64_PODS_PER_NODE" \ - --arm64-pods-per-node "$ARM64_PODS_PER_NODE" \ +GEN_ARGS=( + --arm64-instance-type "$ARM64_INSTANCE" + --amd64-instance-type "$AMD64_INSTANCE" + --replicas "$REPLICAS" + --pods-per-node "$PODS_PER_NODE" + --amd64-replicas "$AMD64_REPLICAS" + --arm64-replicas "$ARM64_REPLICAS" + --amd64-pods-per-node "$AMD64_PODS_PER_NODE" + --arm64-pods-per-node "$ARM64_PODS_PER_NODE" --output-dir "$GENERATED_DIR" +) +if [[ "${AUTOSCALING,,}" == "true" ]]; then + AMD64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_min 2) + AMD64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_max 8) + ARM64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_min 4) + ARM64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_max 8) + GEN_ARGS+=( + --autoscaling + --amd64-min "$AMD64_MIN" + --amd64-max "$AMD64_MAX" + --arm64-min "$ARM64_MIN" + --arm64-max "$ARM64_MAX" + ) +fi +uv run "$MODULE_DIR/scripts/python/generate_buildkit.py" "${GEN_ARGS[@]}" # --- Apply NodePools (with cluster name substitution) --- @@ -93,5 +109,13 @@ else kubectl rollout status deployment/buildkitd-amd64 -n buildkit --timeout=15m fi +# --- KEDA autoscaling (optional) --- +# Scales on the in-cluster buildkit LB metrics; no external metrics backend. + +if [[ "${AUTOSCALING,,}" == "true" ]]; then + echo "Applying KEDA autoscaling manifests..." + kubectl_apply_if_changed -f "$GENERATED_DIR/autoscaling.yaml" +fi + echo "BuildKit deployed." 
kubectl get pods -n buildkit -o wide diff --git a/osdc/modules/buildkit/kubernetes/base/drain-configmap.yaml b/osdc/modules/buildkit/kubernetes/base/drain-configmap.yaml new file mode 100644 index 00000000..89395ae9 --- /dev/null +++ b/osdc/modules/buildkit/kubernetes/base/drain-configmap.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: buildkitd-drain + namespace: buildkit +data: + # preStop drain: block termination until no in-flight build remains. A build + # keeps an ESTABLISHED inbound connection on :1234 for its whole duration; + # require two consecutive idle polls so a transient health check can't be + # mistaken for "done". terminationGracePeriodSeconds caps the total wait. + drain.sh: | + #!/bin/sh + idle=0 + while [ "$idle" -lt 2 ]; do + if netstat -tn 2>/dev/null | awk '$NF=="ESTABLISHED" && $4 ~ /:1234$/{f=1} END{exit !f}'; then + idle=0 + else + idle=$((idle + 1)) + fi + sleep 15 + done diff --git a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml index 3bfd4eb2..a52cece7 100644 --- a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml +++ b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml @@ -24,6 +24,10 @@ data: timeout connect 5s timeout client 120m timeout server 120m + # Queue a build (server maxconn=1, below) while KEDA/Karpenter add pods, + # instead of stacking it on a busy pod. The runner also retries the + # connect, so this is an upper bound, not the only safety net. + timeout queue 10m log global option tcplog @@ -71,15 +75,15 @@ data: # resolution warnings. 
backend bk_arm64 balance leastconn - server-template arm 10 buildkitd-arm64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 + server-template arm 10 buildkitd-arm64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1 backend bk_amd64 balance leastconn - server-template amd 10 buildkitd-amd64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 + server-template amd 10 buildkitd-amd64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1 backend bk_all balance leastconn - server-template all 20 buildkitd-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 + server-template all 20 buildkitd-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1 --- apiVersion: apps/v1 diff --git a/osdc/modules/buildkit/kubernetes/base/kustomization.yaml b/osdc/modules/buildkit/kubernetes/base/kustomization.yaml index 9cbffb38..32572408 100644 --- a/osdc/modules/buildkit/kubernetes/base/kustomization.yaml +++ b/osdc/modules/buildkit/kubernetes/base/kustomization.yaml @@ -5,6 +5,8 @@ kind: Kustomization resources: - namespace.yaml - configmap.yaml + - drain-configmap.yaml - haproxy.yaml - service.yaml - networkpolicy.yaml + - poddisruptionbudget.yaml diff --git a/osdc/modules/buildkit/kubernetes/base/poddisruptionbudget.yaml b/osdc/modules/buildkit/kubernetes/base/poddisruptionbudget.yaml new file mode 100644 index 00000000..b717bb64 --- /dev/null +++ b/osdc/modules/buildkit/kubernetes/base/poddisruptionbudget.yaml @@ -0,0 +1,25 @@ +# Cap voluntary disruptions (node consolidation, drains) to one builder per arch +# at a time so evictions go through the preStop drain instead of killing builds. 
+apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: buildkitd-arm64 + namespace: buildkit +spec: + maxUnavailable: 1 + selector: + matchLabels: + app: buildkitd + arch: arm64 +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: buildkitd-amd64 + namespace: buildkit +spec: + maxUnavailable: 1 + selector: + matchLabels: + app: buildkitd + arch: amd64 diff --git a/osdc/modules/buildkit/scripts/python/generate_buildkit.py b/osdc/modules/buildkit/scripts/python/generate_buildkit.py index e7468418..830b8998 100644 --- a/osdc/modules/buildkit/scripts/python/generate_buildkit.py +++ b/osdc/modules/buildkit/scripts/python/generate_buildkit.py @@ -103,6 +103,7 @@ def generate_deployment_yaml( arm64_replicas: int | None = None, amd64_pods_per_node: int | None = None, arm64_pods_per_node: int | None = None, + autoscaling: bool = False, ) -> str: """Generate the combined Deployment YAML for both architectures. @@ -118,6 +119,37 @@ def generate_deployment_yaml( arm64_res = compute_pod_resources(arm64_instance, arm64_pods_per_node) amd64_res = compute_pod_resources(amd64_instance, amd64_pods_per_node) + # When KEDA owns the replica count, omit `replicas` and add a preStop drain + # that holds the pod open until its in-flight build finishes. `replicas_line` + # is computed per-arch inside _deployment_block (below). 
+ grace_line = " terminationGracePeriodSeconds: 8100\n" if autoscaling else "" + lifecycle_block = ( + """ + lifecycle: + preStop: + exec: + command: ["/bin/sh", "/opt/drain/drain.sh"]""" + if autoscaling + else "" + ) + drain_mount = ( + """ + - name: drain + mountPath: /opt/drain + readOnly: true""" + if autoscaling + else "" + ) + drain_volume = ( + """ + - name: drain + configMap: + name: buildkitd-drain + defaultMode: 0555""" + if autoscaling + else "" + ) + log_info( f"arm64 ({arm64_instance}): {arm64_res['cpu']} vCPU, {arm64_res['memory_gi']}Gi per pod " f"(allocatable: {arm64_res['allocatable_cpu_m']}m CPU, {arm64_res['allocatable_mem_mi']}Mi mem)" @@ -128,6 +160,7 @@ def generate_deployment_yaml( ) def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_node): + replicas_line = "" if autoscaling else f" replicas: {replicas}\n" return f"""apiVersion: apps/v1 kind: Deployment metadata: @@ -139,8 +172,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no app.kubernetes.io/name: buildkitd app.kubernetes.io/component: build-service spec: - replicas: {replicas} - strategy: +{replicas_line} strategy: type: RollingUpdate rollingUpdate: maxSurge: 0 @@ -155,7 +187,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no app: buildkitd arch: {arch} spec: - nodeSelector: +{grace_line} nodeSelector: workload-type: buildkit instance-type: "{instance_type}" @@ -208,7 +240,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no fieldPath: metadata.name securityContext: - privileged: true + privileged: true{lifecycle_block} readinessProbe: exec: @@ -237,7 +269,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no subPathExpr: $(POD_NAME) - name: git-cache mountPath: /opt/git-cache - readOnly: true + readOnly: true{drain_mount} volumes: - name: config @@ -252,7 +284,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, 
replicas, pods_per_no - name: git-cache hostPath: path: /mnt/k8s-disks/0/git-cache - type: DirectoryOrCreate""" + type: DirectoryOrCreate{drain_volume}""" arm64_block = _deployment_block( "arm64", arm64_instance, arm64_res["cpu"], arm64_res["memory_gi"], arm64_replicas, arm64_pods_per_node @@ -453,6 +485,63 @@ def _nodepool_block(arch, instance_type, cpu_limit, memory_limit_gi): return arm64_block + "\n\n---\n" + amd64_block + "\n" +def generate_autoscaling_yaml( + amd64_min: int, + amd64_max: int, + arm64_min: int, + arm64_max: int, +) -> str: + """Generate per-arch KEDA ScaledObjects. + + Each arch scales on its HAProxy backend's active build count + (haproxy_backend_current_sessions), scraped in-cluster from the buildkit LB + metrics endpoint — no external metrics backend. With server maxconn=1, the LB + queues bursts while KEDA/Karpenter bring up pods; minReplicaCount keeps a warm + baseline so the common case has a free pod immediately. + """ + + metrics_url = "http://buildkitd-lb-metrics.buildkit.svc.cluster.local:9404/metrics" + + def _scaledobject(arch, backend, min_replicas, max_replicas): + return f"""apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: buildkitd-{arch} + namespace: buildkit +spec: + scaleTargetRef: + name: buildkitd-{arch} + minReplicaCount: {min_replicas} + maxReplicaCount: {max_replicas} + cooldownPeriod: 600 + advanced: + horizontalPodAutoscalerConfig: + behavior: + scaleDown: + stabilizationWindowSeconds: 600 + policies: + - type: Pods + value: 1 + periodSeconds: 120 + triggers: + - type: metrics-api + metadata: + url: "{metrics_url}" + format: "prometheus" + valueLocation: 'haproxy_backend_current_sessions{{proxy="{backend}"}}' + targetValue: "1" +""" + + header = "# KEDA autoscaling — auto-generated by generate_buildkit.py. 
Do not edit by hand.\n" + return ( + header + + "\n" + + _scaledobject("amd64", "bk_amd64", amd64_min, amd64_max) + + "\n---\n" + + _scaledobject("arm64", "bk_arm64", arm64_min, arm64_max) + ) + + def main(): parser = argparse.ArgumentParser(description="Generate BuildKit Deployment and NodePool YAMLs") parser.add_argument("--arm64-instance-type", required=True, help="ARM64 instance type (e.g., m8gd.24xlarge)") @@ -464,8 +553,17 @@ def main(): parser.add_argument("--amd64-pods-per-node", type=int, default=None, help="Override amd64 pods per node") parser.add_argument("--arm64-pods-per-node", type=int, default=None, help="Override arm64 pods per node") parser.add_argument("--output-dir", required=True, help="Output directory for generated YAMLs") + parser.add_argument("--autoscaling", action="store_true", help="Generate KEDA autoscaling manifests") + parser.add_argument("--amd64-min", type=int, default=0, help="KEDA minReplicaCount for amd64") + parser.add_argument("--amd64-max", type=int, default=0, help="KEDA maxReplicaCount for amd64") + parser.add_argument("--arm64-min", type=int, default=0, help="KEDA minReplicaCount for arm64") + parser.add_argument("--arm64-max", type=int, default=0, help="KEDA maxReplicaCount for arm64") args = parser.parse_args() + if args.autoscaling and not (args.amd64_max and args.arm64_max): + log_error("--autoscaling requires --amd64-max and --arm64-max") + return 1 + # Validate instance types for it in [args.arm64_instance_type, args.amd64_instance_type]: if it not in INSTANCE_SPECS: @@ -498,26 +596,51 @@ def main(): arm64_replicas=args.arm64_replicas, amd64_pods_per_node=args.amd64_pods_per_node, arm64_pods_per_node=args.arm64_pods_per_node, + autoscaling=args.autoscaling, ) deployment_path = output_dir / "deployment.yaml" deployment_path.write_text(deployment_yaml) log_info(f"Wrote {deployment_path}") - # Generate nodepools - nodepools_yaml = generate_nodepools_yaml( - args.arm64_instance_type, - args.amd64_instance_type, - 
args.replicas, - args.pods_per_node, - amd64_replicas=args.amd64_replicas, - arm64_replicas=args.arm64_replicas, - amd64_pods_per_node=args.amd64_pods_per_node, - arm64_pods_per_node=args.arm64_pods_per_node, - ) + # Size NodePool limits for the peak: the per-arch autoscaling ceiling + # (amd64_max / arm64_max) when enabled, otherwise the per-arch replica counts. + if args.autoscaling: + nodepools_yaml = generate_nodepools_yaml( + args.arm64_instance_type, + args.amd64_instance_type, + args.replicas, + args.pods_per_node, + amd64_replicas=args.amd64_max, + arm64_replicas=args.arm64_max, + amd64_pods_per_node=args.amd64_pods_per_node, + arm64_pods_per_node=args.arm64_pods_per_node, + ) + else: + nodepools_yaml = generate_nodepools_yaml( + args.arm64_instance_type, + args.amd64_instance_type, + args.replicas, + args.pods_per_node, + amd64_replicas=args.amd64_replicas, + arm64_replicas=args.arm64_replicas, + amd64_pods_per_node=args.amd64_pods_per_node, + arm64_pods_per_node=args.arm64_pods_per_node, + ) nodepools_path = output_dir / "nodepools.yaml" nodepools_path.write_text(nodepools_yaml) log_info(f"Wrote {nodepools_path}") + if args.autoscaling: + autoscaling_yaml = generate_autoscaling_yaml( + args.amd64_min, + args.amd64_max, + args.arm64_min, + args.arm64_max, + ) + autoscaling_path = output_dir / "autoscaling.yaml" + autoscaling_path.write_text(autoscaling_yaml) + log_info(f"Wrote {autoscaling_path}") + return 0 diff --git a/osdc/modules/buildkit/scripts/python/test_generate_buildkit.py b/osdc/modules/buildkit/scripts/python/test_generate_buildkit.py index 387a283d..c373c9c0 100644 --- a/osdc/modules/buildkit/scripts/python/test_generate_buildkit.py +++ b/osdc/modules/buildkit/scripts/python/test_generate_buildkit.py @@ -12,6 +12,7 @@ DAEMONSET_OVERHEAD_MEM_MI, MARGIN, compute_pod_resources, + generate_autoscaling_yaml, generate_deployment_yaml, generate_nodepools_yaml, ) @@ -341,6 +342,65 @@ def test_nodepool_limits_scale_per_arch(self): assert 
np["buildkit-arm64"]["spec"]["limits"]["cpu"] == "256" +# ============================================================================ +# autoscaling +# ============================================================================ + + +class TestAutoscalingDeployment: + """When autoscaling=True the Deployment yields control of replicas to KEDA + and gains a preStop drain so scale-down never kills an in-flight build.""" + + def test_omits_replicas(self): + output = generate_deployment_yaml("m8gd.24xlarge", "m6id.24xlarge", 4, 2, autoscaling=True) + for d in parse_all_yaml(output): + assert "replicas" not in d["spec"] + + def test_keeps_replicas_when_disabled(self): + output = generate_deployment_yaml("m8gd.24xlarge", "m6id.24xlarge", 4, 2) + for d in parse_all_yaml(output): + assert d["spec"]["replicas"] == 4 + + def test_drain_prestop_and_grace(self): + output = generate_deployment_yaml("m8gd.24xlarge", "m6id.24xlarge", 4, 2, autoscaling=True) + for d in parse_all_yaml(output): + spec = d["spec"]["template"]["spec"] + assert spec["terminationGracePeriodSeconds"] == 8100 + container = spec["containers"][0] + assert container["lifecycle"]["preStop"]["exec"]["command"] == ["/bin/sh", "/opt/drain/drain.sh"] + assert "drain" in {vm["name"] for vm in container["volumeMounts"]} + assert "drain" in {v["name"] for v in spec["volumes"]} + + +class TestGenerateAutoscalingYaml: + """Tests for generate_autoscaling_yaml — in-cluster KEDA ScaledObjects.""" + + def _docs(self): + output = generate_autoscaling_yaml(2, 8, 4, 8) + return parse_all_yaml(output) + + def test_only_scaledobjects_no_external_auth(self): + # In-cluster signal → no TriggerAuthentication / Grafana secret needed. 
+ kinds = sorted(d["kind"] for d in self._docs()) + assert kinds == ["ScaledObject", "ScaledObject"] + + def test_per_arch_min_max(self): + scaled = {d["metadata"]["name"]: d for d in self._docs() if d["kind"] == "ScaledObject"} + assert set(scaled) == {"buildkitd-amd64", "buildkitd-arm64"} + assert scaled["buildkitd-amd64"]["spec"]["minReplicaCount"] == 2 + assert scaled["buildkitd-amd64"]["spec"]["maxReplicaCount"] == 8 + assert scaled["buildkitd-arm64"]["spec"]["minReplicaCount"] == 4 + assert scaled["buildkitd-arm64"]["spec"]["maxReplicaCount"] == 8 + + def test_in_cluster_metrics_api_trigger(self): + scaled = {d["metadata"]["name"]: d for d in self._docs() if d["kind"] == "ScaledObject"} + for name, backend in [("buildkitd-amd64", "bk_amd64"), ("buildkitd-arm64", "bk_arm64")]: + trig = scaled[name]["spec"]["triggers"][0] + assert trig["type"] == "metrics-api" + assert "buildkitd-lb-metrics.buildkit" in trig["metadata"]["url"] + assert f'proxy="{backend}"' in trig["metadata"]["valueLocation"] + + # ============================================================================ # generate_nodepools_yaml # ============================================================================ @@ -520,6 +580,64 @@ def test_deployment_yaml_parseable(self, tmp_path): docs = parse_all_yaml(deployment_text) assert len(docs) == 2 + def test_autoscaling_writes_manifests(self, tmp_path): + output_dir = tmp_path / "output" + + import generate_buildkit + + test_args = [ + "generate_buildkit.py", + "--arm64-instance-type", + "m8gd.24xlarge", + "--amd64-instance-type", + "m6id.24xlarge", + "--replicas", + "1", + "--pods-per-node", + "2", + "--output-dir", + str(output_dir), + "--autoscaling", + "--amd64-min", + "2", + "--amd64-max", + "8", + "--arm64-min", + "4", + "--arm64-max", + "8", + ] + with patch.object(sys, "argv", test_args): + result = generate_buildkit.main() + + assert result == 0 + autoscaling = parse_all_yaml((output_dir / "autoscaling.yaml").read_text()) + assert 
sorted(d["kind"] for d in autoscaling) == ["ScaledObject", "ScaledObject"] + + def test_autoscaling_requires_params(self, tmp_path): + output_dir = tmp_path / "output" + + import generate_buildkit + + test_args = [ + "generate_buildkit.py", + "--arm64-instance-type", + "m8gd.24xlarge", + "--amd64-instance-type", + "m6id.24xlarge", + "--replicas", + "1", + "--pods-per-node", + "2", + "--output-dir", + str(output_dir), + "--autoscaling", + ] + with patch.object(sys, "argv", test_args): + result = generate_buildkit.main() + + assert result == 1 + def test_unknown_instance_type_fails(self, tmp_path): output_dir = tmp_path / "output" test_args = [ diff --git a/osdc/modules/keda/deploy.sh b/osdc/modules/keda/deploy.sh new file mode 100755 index 00000000..59f422ad --- /dev/null +++ b/osdc/modules/keda/deploy.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -euo pipefail +# +# KEDA module deploy script. +# Called by: just deploy-module keda +# Args: $1=cluster-id $2=cluster-name $3=region +# +# Installs the KEDA operator (provides the ScaledObject/TriggerAuthentication +# CRDs that the buildkit module uses to autoscale builders). + +CLUSTER="$1" +MODULE_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="${OSDC_ROOT:-$(cd "$MODULE_DIR/../.." && pwd)}" +UPSTREAM_ROOT="${OSDC_UPSTREAM:-$REPO_ROOT}" +# shellcheck source=/dev/null +source "$UPSTREAM_ROOT/scripts/mise-activate.sh" +# shellcheck source=/dev/null +source "$UPSTREAM_ROOT/scripts/helm-upgrade.sh" +CFG="$UPSTREAM_ROOT/scripts/cluster-config.py" + +NAMESPACE="keda" +CHART_VERSION=$(uv run "$CFG" "$CLUSTER" keda.chart_version 2.16.1) + +helm repo add kedacore https://kedacore.github.io/charts >/dev/null 2>&1 || true +helm repo update kedacore >/dev/null 2>&1 || true + +helm_upgrade_if_changed keda "$NAMESPACE" \ + --create-namespace \ + --version "$CHART_VERSION" \ + --timeout 5m \ + --wait \ + kedacore/keda + +echo "KEDA deployed (chart $CHART_VERSION)." 
From 0475d6da04e9732b7e0cb62d3792b7baa616f2cf Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 02:07:35 -0700 Subject: [PATCH 03/14] Update (base update) [ghstack-poisoned] --- osdc/modules/buildkit/README.md | 15 +++++- .../scripts/python/generate_buildkit.py | 48 +++++++++++-------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md index e17fafdf..37d64f9c 100644 --- a/osdc/modules/buildkit/README.md +++ b/osdc/modules/buildkit/README.md @@ -24,7 +24,20 @@ back to a small warm baseline when idle. limits are sized to `*_max`. - **Safe scale-down** — `preStop` drain (waits until the pod's `:1234` is idle) + long `terminationGracePeriodSeconds` + PDB, so a build is never killed - mid-flight. + mid-flight. Scale-down removes an arbitrary pod, which may be mid-build; the + drain holds termination until that build finishes, but + `terminationGracePeriodSeconds` is a hard SIGKILL cap, so it must outlast the + longest possible build. It's set to **8100s (135m) = 120m** (the max time a + docker build may run, matching HAProxy `timeout server`) **+ ~15m** of + headroom for the drain's idle-detection polling. A build that starts just + before drain still completes; the cap only fires as a backstop if a pod never + drains. + The **PDB** (`maxUnavailable: 1` per arch) bounds *voluntary* disruptions — + node consolidation and manual `kubectl drain` — to one builder per arch at a + time, so those go through the preStop drain one pod at a time instead of + evicting several in-flight builds at once. (KEDA scale-down deletes pods + directly rather than via the eviction API, so it isn't PDB-gated — the drain + + grace cap above is what protects that path.) Build clients should retry the connect so a build can wait for a pod from a cold or queued pool. 
diff --git a/osdc/modules/buildkit/scripts/python/generate_buildkit.py b/osdc/modules/buildkit/scripts/python/generate_buildkit.py index 830b8998..35f00fd0 100644 --- a/osdc/modules/buildkit/scripts/python/generate_buildkit.py +++ b/osdc/modules/buildkit/scripts/python/generate_buildkit.py @@ -36,6 +36,12 @@ RED = "\033[0;31m" NC = "\033[0m" +# Sentinel for optional template lines. An optional fragment is either its YAML +# lines or this sentinel; lines equal to it are dropped when a block is assembled +# (see _deployment_block). This lets every fragment sit on its own line in the +# templates below instead of being concatenated onto an adjacent line. +_OMIT = "<>" + def log_info(msg): print(f"{GREEN}\u2192{NC} {msg}") @@ -120,34 +126,32 @@ def generate_deployment_yaml( amd64_res = compute_pod_resources(amd64_instance, amd64_pods_per_node) # When KEDA owns the replica count, omit `replicas` and add a preStop drain - # that holds the pod open until its in-flight build finishes. `replicas_line` - # is computed per-arch inside _deployment_block (below). - grace_line = " terminationGracePeriodSeconds: 8100\n" if autoscaling else "" + # that holds the pod open until its in-flight build finishes. Each fragment is + # either its YAML lines or _OMIT (dropped at assembly), so it sits on its own + # line in the template. (`replicas_line` is per-arch — computed below.) 
+ grace_line = " terminationGracePeriodSeconds: 8100" if autoscaling else _OMIT lifecycle_block = ( - """ - lifecycle: + """ lifecycle: preStop: exec: command: ["/bin/sh", "/opt/drain/drain.sh"]""" if autoscaling - else "" + else _OMIT ) drain_mount = ( - """ - - name: drain + """ - name: drain mountPath: /opt/drain readOnly: true""" if autoscaling - else "" + else _OMIT ) drain_volume = ( - """ - - name: drain + """ - name: drain configMap: name: buildkitd-drain defaultMode: 0555""" if autoscaling - else "" + else _OMIT ) log_info( @@ -160,8 +164,8 @@ def generate_deployment_yaml( ) def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_node): - replicas_line = "" if autoscaling else f" replicas: {replicas}\n" - return f"""apiVersion: apps/v1 + replicas_line = _OMIT if autoscaling else f" replicas: {replicas}" + block = f"""apiVersion: apps/v1 kind: Deployment metadata: name: buildkitd-{arch} @@ -172,7 +176,8 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no app.kubernetes.io/name: buildkitd app.kubernetes.io/component: build-service spec: -{replicas_line} strategy: +{replicas_line} + strategy: type: RollingUpdate rollingUpdate: maxSurge: 0 @@ -187,7 +192,8 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no app: buildkitd arch: {arch} spec: -{grace_line} nodeSelector: +{grace_line} + nodeSelector: workload-type: buildkit instance-type: "{instance_type}" @@ -240,7 +246,8 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no fieldPath: metadata.name securityContext: - privileged: true{lifecycle_block} + privileged: true +{lifecycle_block} readinessProbe: exec: @@ -269,7 +276,8 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no subPathExpr: $(POD_NAME) - name: git-cache mountPath: /opt/git-cache - readOnly: true{drain_mount} + readOnly: true +{drain_mount} volumes: - name: config @@ -284,7 +292,9 @@ def 
_deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no - name: git-cache hostPath: path: /mnt/k8s-disks/0/git-cache - type: DirectoryOrCreate{drain_volume}""" + type: DirectoryOrCreate +{drain_volume}""" + return "\n".join(line for line in block.splitlines() if line != _OMIT) arm64_block = _deployment_block( "arm64", arm64_instance, arm64_res["cpu"], arm64_res["memory_gi"], arm64_replicas, arm64_pods_per_node From 24a5f136b08f884c2916ed7570ce4485b309da4c Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 02:41:02 -0700 Subject: [PATCH 04/14] Update [ghstack-poisoned] --- .../docker/test-buildkit-scale/Dockerfile | 6 -- .../scripts/python/phases.py | 14 ++--- .../scripts/python/test_run.py | 9 ++- .../workflows/build-image-scale.yaml | 56 +++++++++---------- 4 files changed, 39 insertions(+), 46 deletions(-) delete mode 100644 osdc/integration-tests/docker/test-buildkit-scale/Dockerfile diff --git a/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile b/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile deleted file mode 100644 index e317eb61..00000000 --- a/osdc/integration-tests/docker/test-buildkit-scale/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -# Holds one BuildKit slot (~10 min) so a burst of parallel builds exceeds the -# warm baseline and forces KEDA scale-up. CACHEBUST differs per build so each -# actually runs (no layer reuse). 
-FROM alpine:3.21 -ARG CACHEBUST -RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600 diff --git a/osdc/integration-tests/scripts/python/phases.py b/osdc/integration-tests/scripts/python/phases.py index 17654000..04d3589d 100644 --- a/osdc/integration-tests/scripts/python/phases.py +++ b/osdc/integration-tests/scripts/python/phases.py @@ -276,17 +276,17 @@ def prepare_pr( # Write integration test workflow (workflows_dir / "integration-test.yaml").write_text(workflow_content) - # Copy reusable BuildKit workflows + # Copy reusable BuildKit workflows (connectivity + autoscaling scale test). + # The scale test builds an inline Dockerfile, so it needs no copied context. wf_root = upstream_dir / "integration-tests" / "workflows" for wf in ("build-image.yaml", "build-image-scale.yaml"): (workflows_dir / wf).write_text((wf_root / wf).read_text()) - # Copy test Dockerfiles (connectivity + autoscaling scale test) - docker_root = upstream_dir / "integration-tests" / "docker" - for name in ("test-buildkit", "test-buildkit-scale"): - dst = canary_path / "docker" / name - dst.mkdir(parents=True, exist_ok=True) - (dst / "Dockerfile").write_text((docker_root / name / "Dockerfile").read_text()) + # Copy test Dockerfile (connectivity test context) + docker_dir = canary_path / "docker" / "test-buildkit" + docker_dir.mkdir(parents=True, exist_ok=True) + dockerfile_src = upstream_dir / "integration-tests" / "docker" / "test-buildkit" / "Dockerfile" + (docker_dir / "Dockerfile").write_text(dockerfile_src.read_text()) # Commit run_cmd(["git", "add", "-A"], cwd=canary_path) diff --git a/osdc/integration-tests/scripts/python/test_run.py b/osdc/integration-tests/scripts/python/test_run.py index dc4d5188..5f51e8d2 100644 --- a/osdc/integration-tests/scripts/python/test_run.py +++ b/osdc/integration-tests/scripts/python/test_run.py @@ -112,13 +112,12 @@ def workflow_template(tmp_path): ) (wf_dir / "integration-test.yaml.tpl").write_text(template) - # Also create reusable workflows 
and Dockerfiles for prepare_pr + # Also create reusable workflows and Dockerfile for prepare_pr (wf_dir / "build-image.yaml").write_text("name: build-image\n") (wf_dir / "build-image-scale.yaml").write_text("name: build-image-scale\n") - for name in ("test-buildkit", "test-buildkit-scale"): - docker_dir = upstream / "integration-tests" / "docker" / name - docker_dir.mkdir(parents=True) - (docker_dir / "Dockerfile").write_text("FROM alpine\n") + docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit" + docker_dir.mkdir(parents=True) + (docker_dir / "Dockerfile").write_text("FROM alpine\n") return upstream diff --git a/osdc/integration-tests/workflows/build-image-scale.yaml b/osdc/integration-tests/workflows/build-image-scale.yaml index b485dc5a..dbc3d185 100644 --- a/osdc/integration-tests/workflows/build-image-scale.yaml +++ b/osdc/integration-tests/workflows/build-image-scale.yaml @@ -1,10 +1,12 @@ # Reusable workflow: BuildKit autoscaling scale test. -# Launches a burst of 8 parallel builds against one arch's remote BuildKit, each -# holding a slot (server maxconn=1) for ~10 min via a sleep. The warm baseline -# (amd64_min / arm64_min) is below the burst, so the builds finish within -# timeout-minutes only if KEDA scales the pool up. Without scale-up they -# serialize through the baseline pods and the back of the queue times out — i.e. -# this job FAILS when autoscaling does not happen. +# Mirrors the real .ci/docker/docker-builds.yml path — Docker Buildx with the +# remote driver pointed at the per-arch buildkitd endpoint — but builds a +# trivial image that just sleeps, so each of the 8 parallel jobs holds one +# BuildKit slot (server maxconn=1) for ~10 min. The warm baseline is below the +# burst, so the builds finish within timeout-minutes only if KEDA scales the +# pool up; without scale-up they serialize through the baseline pods and the +# back of the queue times out — i.e. this job FAILS when autoscaling does not +# happen. 
name: BuildKit Scale Test on: @@ -16,9 +18,8 @@ on: type: string runner_label: description: "Runner label to use (includes cluster prefix)" - required: false + required: true type: string - default: "l-x86iavx512-2-4" jobs: scale: @@ -33,27 +34,26 @@ jobs: container: image: ghcr.io/actions/actions-runner:latest steps: - - name: Install buildctl - run: | - BUILDKIT_VERSION="v0.29.0" - mkdir -p "$HOME/.local/bin" - curl -sSL "https://github.com/moby/buildkit/releases/download/${BUILDKIT_VERSION}/buildkit-${BUILDKIT_VERSION}.linux-amd64.tar.gz" \ - | tar xz --strip-components=1 -C "$HOME/.local/bin" bin/buildctl - echo "$HOME/.local/bin" >> "$GITHUB_PATH" - "$HOME/.local/bin/buildctl" --version - - - name: Checkout - uses: actions/checkout@v4 + - name: Set up Docker Buildx (remote) + uses: docker/setup-buildx-action@v3 + with: + driver: remote + endpoint: tcp://buildkitd-${{ inputs.arch }}.buildkit:1234 - name: Occupy a BuildKit slot (~10 min) to drive autoscaling + shell: bash run: | - ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" - echo "Build ${{ matrix.replica }} -> $ENDPOINT" - buildctl --addr "$ENDPOINT" build \ - --frontend dockerfile.v0 \ - --local context=docker/test-buildkit-scale \ - --local dockerfile=docker/test-buildkit-scale \ - --opt build-arg:CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }} \ + set -ex + cat > Dockerfile.scale <<'EOF' + FROM alpine:3.21 + ARG CACHEBUST + RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600 + EOF + # --no-cache + a per-job CACHEBUST make each build distinct so it + # actually runs the sleep; cacheonly keeps it remote (no push/load). + docker buildx build \ + --platform "linux/${{ inputs.arch }}" \ + --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \ --no-cache \ - --output type=cacheonly - echo "PASS: build ${{ matrix.replica }} finished within timeout" + --output type=cacheonly \ + -f Dockerfile.scale . 
From c906ed44af0bafd64ecdd90ed4541571fc41f4c6 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 02:51:46 -0700 Subject: [PATCH 05/14] Update (base update) [ghstack-poisoned] --- .../buildkit/kubernetes/base/haproxy.yaml | 6 +++--- .../buildkit/scripts/python/generate_buildkit.py | 16 +++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml index a52cece7..45c630d6 100644 --- a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml +++ b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml @@ -25,9 +25,9 @@ data: timeout client 120m timeout server 120m # Queue a build (server maxconn=1, below) while KEDA/Karpenter add pods, - # instead of stacking it on a busy pod. The runner also retries the - # connect, so this is an upper bound, not the only safety net. - timeout queue 10m + # instead of stacking it on a busy pod. Keep <= 120m, the max time a + # docker build is allowed to run. + timeout queue 60m log global option tcplog diff --git a/osdc/modules/buildkit/scripts/python/generate_buildkit.py b/osdc/modules/buildkit/scripts/python/generate_buildkit.py index 35f00fd0..b39a09a2 100644 --- a/osdc/modules/buildkit/scripts/python/generate_buildkit.py +++ b/osdc/modules/buildkit/scripts/python/generate_buildkit.py @@ -127,9 +127,11 @@ def generate_deployment_yaml( # When KEDA owns the replica count, omit `replicas` and add a preStop drain # that holds the pod open until its in-flight build finishes. Each fragment is - # either its YAML lines or _OMIT (dropped at assembly), so it sits on its own - # line in the template. (`replicas_line` is per-arch — computed below.) - grace_line = " terminationGracePeriodSeconds: 8100" if autoscaling else _OMIT + # either its YAML or _OMIT; _deployment_block drops _OMIT lines (matched after + # stripping), so single lines carry their indent in the template (e.g. 
+ # ` {grace_line}`) while multi-line blocks self-indent. (`replicas_line` + # is per-arch — computed below.) + grace_line = "terminationGracePeriodSeconds: 8100" if autoscaling else _OMIT lifecycle_block = ( """ lifecycle: preStop: @@ -164,7 +166,7 @@ def generate_deployment_yaml( ) def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_node): - replicas_line = _OMIT if autoscaling else f" replicas: {replicas}" + replicas_line = _OMIT if autoscaling else f"replicas: {replicas}" block = f"""apiVersion: apps/v1 kind: Deployment metadata: @@ -176,7 +178,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no app.kubernetes.io/name: buildkitd app.kubernetes.io/component: build-service spec: -{replicas_line} + {replicas_line} strategy: type: RollingUpdate rollingUpdate: @@ -192,7 +194,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no app: buildkitd arch: {arch} spec: -{grace_line} + {grace_line} nodeSelector: workload-type: buildkit instance-type: "{instance_type}" @@ -294,7 +296,7 @@ def _deployment_block(arch, instance_type, cpu, memory_gi, replicas, pods_per_no path: /mnt/k8s-disks/0/git-cache type: DirectoryOrCreate {drain_volume}""" - return "\n".join(line for line in block.splitlines() if line != _OMIT) + return "\n".join(line for line in block.splitlines() if line.strip() != _OMIT) arm64_block = _deployment_block( "arm64", arm64_instance, arm64_res["cpu"], arm64_res["memory_gi"], arm64_replicas, arm64_pods_per_node From dd8d8e08e54cc155e91855658c2457d5a7635356 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 03:52:30 -0700 Subject: [PATCH 06/14] Update (base update) [ghstack-poisoned] --- osdc/modules/buildkit/README.md | 11 +++++++++++ osdc/modules/buildkit/deploy.sh | 10 +++++++++- osdc/modules/buildkit/kubernetes/base/haproxy.yaml | 6 ++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/osdc/modules/buildkit/README.md 
b/osdc/modules/buildkit/README.md index 37d64f9c..8f63639b 100644 --- a/osdc/modules/buildkit/README.md +++ b/osdc/modules/buildkit/README.md @@ -42,4 +42,15 @@ back to a small warm baseline when idle. Build clients should retry the connect so a build can wait for a pod from a cold or queued pool. +## HAProxy config changes roll the LB + +HAProxy renders its config only at container start, and nothing else restarts +the `buildkitd-lb` pod, so a bare ConfigMap update (`maxconn`, timeouts, +backends) would silently not take effect. `deploy.sh` stamps the LB pod template +with a `checksum/config` annotation = a hash of `haproxy.yaml`; when the config +changes the hash changes, which rolls the Deployment so the new pod picks up the +new config. An unchanged config keeps the same hash, so routine deploys don't +churn the LB. (The buildkitd worker pods do **not** yet have this, so a +`buildkitd.toml` / `drain.sh` change needs a manual rollout to take effect.) + Requires the `keda` module deployed before `buildkit` (provides the CRDs). diff --git a/osdc/modules/buildkit/deploy.sh b/osdc/modules/buildkit/deploy.sh index ece804ad..f97d0f42 100755 --- a/osdc/modules/buildkit/deploy.sh +++ b/osdc/modules/buildkit/deploy.sh @@ -75,7 +75,15 @@ sed "s/CLUSTER_NAME_PLACEHOLDER/$CNAME/g" "$GENERATED_DIR/nodepools.yaml" | kube # --- Apply static k8s resources --- echo "Applying BuildKit static manifests..." -kubectl_apply_if_changed -k "$MODULE_DIR/kubernetes/base/" +# Stamp the buildkitd-lb pod template with a hash of haproxy.yaml. HAProxy reads +# its config only at container start, and nothing else restarts the LB, so +# without this a ConfigMap change (maxconn, timeouts, backends) would silently +# not take effect until the pod happened to be recreated. Changing the hash +# rolls the Deployment whenever the config changes. 
+HAPROXY_SUM=$(shasum -a 256 "$MODULE_DIR/kubernetes/base/haproxy.yaml" | cut -c1-12) +kubectl kustomize "$MODULE_DIR/kubernetes/base/" \ + | sed "s/__HAPROXY_CFG_CHECKSUM__/$HAPROXY_SUM/" \ + | kubectl_apply_if_changed -f - # --- Apply generated Deployments (only if changed) --- diff --git a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml index 45c630d6..5b5f2f7f 100644 --- a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml +++ b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml @@ -104,6 +104,12 @@ spec: metadata: labels: app: buildkitd-lb + annotations: + # HAProxy renders its config once at container start, so a ConfigMap + # change has no effect until this pod restarts. deploy.sh fills this in + # with a hash of haproxy.yaml, so the Deployment rolls automatically + # whenever the config changes (nothing else restarts the LB). + checksum/config: "__HAPROXY_CFG_CHECKSUM__" spec: # Runs on base-infra nodes (CriticalAddonsOnly taint) tolerations: From 190e88c94e300e6cb62afd05e74e8cc4045d8b61 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 09:32:10 -0700 Subject: [PATCH 07/14] Update [ghstack-poisoned] --- .../workflows/build-image-scale.yaml | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/osdc/integration-tests/workflows/build-image-scale.yaml b/osdc/integration-tests/workflows/build-image-scale.yaml index dbc3d185..86ccc08e 100644 --- a/osdc/integration-tests/workflows/build-image-scale.yaml +++ b/osdc/integration-tests/workflows/build-image-scale.yaml @@ -34,11 +34,20 @@ jobs: container: image: ghcr.io/actions/actions-runner:latest steps: - - name: Set up Docker Buildx (remote) - uses: docker/setup-buildx-action@v3 - with: - driver: remote - endpoint: tcp://buildkitd-${{ inputs.arch }}.buildkit:1234 + - name: Set up Docker Buildx (remote, no bootstrap) + shell: bash + # NOT docker/setup-buildx-action: it always runs `buildx inspect + # --bootstrap`, 
whose remote-driver health check gives up after a + # hardcoded ~20s — shorter than a cold scale-up, so jobs fail at setup + # during a burst. `create` (no --bootstrap) just registers the builder; + # the build step below retries so the connection waits out scale-up. + run: | + set -ex + docker buildx create \ + --name osdc-remote \ + --driver remote \ + --use \ + "tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" - name: Occupy a BuildKit slot (~10 min) to drive autoscaling shell: bash @@ -51,6 +60,10 @@ jobs: EOF # --no-cache + a per-job CACHEBUST make each build distinct so it # actually runs the sleep; cacheonly keeps it remote (no push/load). + # No retry needed: with the builder created above (no --bootstrap), the + # build connection waits in HAProxy's queue for a pod to free / scale up + # (bounded by `timeout queue`), instead of buildx's ~20s bootstrap gate. + # The 30-min job timeout still fails the test if scale-up never happens. docker buildx build \ --platform "linux/${{ inputs.arch }}" \ --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \ From 2453c6fb7a1144946b41e68ef603208cf2da9f4c Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 10:29:00 -0700 Subject: [PATCH 08/14] Update (base update) [ghstack-poisoned] --- osdc/modules/buildkit/README.md | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md index 8f63639b..a98ee0ac 100644 --- a/osdc/modules/buildkit/README.md +++ b/osdc/modules/buildkit/README.md @@ -39,8 +39,28 @@ back to a small warm baseline when idle. directly rather than via the eviction API, so it isn't PDB-gated — the drain + grace cap above is what protects that path.) -Build clients should retry the connect so a build can wait for a pod from a cold -or queued pool. 
+## Build with `buildctl`, not `docker buildx` + +Clients must reach the pool with **`buildctl`** (`buildctl --addr +tcp://buildkitd-<arch>.buildkit:1234 build ...`), not `docker buildx` against a +remote builder. + +The autoscaling design relies on a *patient* client: during a burst the build's +connection sits in HAProxy's queue (above) for the minutes it takes KEDA + +Karpenter to add a pod, and that pending connection is also what keeps the +scale-up signal alive. `buildctl` does exactly this — its build call waits in +the queue up to `timeout queue` with no separate connect deadline. + +`docker buildx` does **not**: before solving it "boots" the remote builder with +a **hardcoded ~20s connect timeout** (`[internal] waiting for connection`), which +is not configurable and far shorter than a cold scale-up. Under a burst the +connection is still queued at 20s, buildx aborts with `context deadline +exceeded`, and — because the connection then drops — the scale-up signal +disappears before KEDA can act, so the pool never grows and every queued build +fails. (`docker/setup-buildx-action` hits the same gate via `inspect +--bootstrap`; removing it doesn't help because `docker buildx build` re-runs the +same boot.) This was confirmed on the staging cluster. So PyTorch's +`.ci/docker/build.sh` uses `buildctl` whenever `REMOTE_BUILDKIT` is set. 
## HAProxy config changes roll the LB From 5d09df7ad72ae6fec701abf0265c6a65227d540b Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 11:44:16 -0700 Subject: [PATCH 09/14] Update (base update) [ghstack-poisoned] --- osdc/modules/buildkit/README.md | 40 ++++++++----------- .../buildkit/kubernetes/base/haproxy.yaml | 4 -- 2 files changed, 17 insertions(+), 27 deletions(-) diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md index a98ee0ac..6e4e30eb 100644 --- a/osdc/modules/buildkit/README.md +++ b/osdc/modules/buildkit/README.md @@ -13,9 +13,9 @@ Absorb bursts of concurrent builds without overloading existing pods, and scale back to a small warm baseline when idle. - **One build per pod** — HAProxy `server maxconn 1` (matches buildkitd - `max-parallelism = 1`). Excess builds **queue** in HAProxy (`timeout queue`) - instead of stacking on a busy pod; as new pods register (DNS), queued builds - flow onto them, so scaled-up pods never sit idle. + `max-parallelism = 1`) so a build never stacks on a busy pod. When every pod is + busy the LB has no slot, so the client must **retry the connect** (see below) + until a pod frees or the pool scales up. - **In-cluster scale signal** — KEDA `ScaledObject` per arch, `metrics-api` scraping the LB's own metrics (`haproxy_backend_current_sessions`) — no external metrics backend. @@ -39,28 +39,22 @@ back to a small warm baseline when idle. directly rather than via the eviction API, so it isn't PDB-gated — the drain + grace cap above is what protects that path.) -## Build with `buildctl`, not `docker buildx` +## Clients must retry the connect -Clients must reach the pool with **`buildctl`** (`buildctl --addr -tcp://buildkitd-<arch>.buildkit:1234 build ...`), not `docker buildx` against a -remote builder. 
+Build clients (both `docker buildx` and `buildctl`) use the `moby/buildkit` Go +client, which dials with gRPC's default **~20s `MinConnectTimeout`** and +**fail-fast** RPCs — there is no client-side flag to make it wait longer. During +a burst, a build whose connection finds no free pod (`maxconn 1`) is dropped by +the client after ~20s, well before KEDA/Karpenter can add a pod (minutes). An +HAProxy-side `timeout queue` does **not** help: the client gives up at 20s +regardless, so queueing on the LB is pointless (and was removed). -The autoscaling design relies on a *patient* client: during a burst the build's -connection sits in HAProxy's queue (above) for the minutes it takes KEDA + -Karpenter to add a pod, and that pending connection is also what keeps the -scale-up signal alive. `buildctl` does exactly this — its build call waits in -the queue up to `timeout queue` with no separate connect deadline. - -`docker buildx` does **not**: before solving it "boots" the remote builder with -a **hardcoded ~20s connect timeout** (`[internal] waiting for connection`), which -is not configurable and far shorter than a cold scale-up. Under a burst the -connection is still queued at 20s, buildx aborts with `context deadline -exceeded`, and — because the connection then drops — the scale-up signal -disappears before KEDA can act, so the pool never grows and every queued build -fails. (`docker/setup-buildx-action` hits the same gate via `inspect ---bootstrap`; removing it doesn't help because `docker buildx build` re-runs the -same boot.) This was confirmed on the staging cluster. So PyTorch's -`.ci/docker/build.sh` uses `buildctl` whenever `REMOTE_BUILDKIT` is set. +So the **client must retry the build** on connection failures until a pod is +free or the pool has scaled up; the repeated attempts also keep the autoscaler's +load signal alive. 
PyTorch's `.ci/docker/build.sh` does this when +`REMOTE_BUILDKIT` is set, and the workflow creates the remote builder *without* +`--bootstrap` (the `docker buildx inspect --bootstrap` health check hits the same +20s gate at setup). This was confirmed on the staging cluster. ## HAProxy config changes roll the LB diff --git a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml index 5b5f2f7f..d37eeca9 100644 --- a/osdc/modules/buildkit/kubernetes/base/haproxy.yaml +++ b/osdc/modules/buildkit/kubernetes/base/haproxy.yaml @@ -24,10 +24,6 @@ data: timeout connect 5s timeout client 120m timeout server 120m - # Queue a build (server maxconn=1, below) while KEDA/Karpenter add pods, - # instead of stacking it on a busy pod. Keep <= 120m, the max time a - # docker build is allowed to run. - timeout queue 60m log global option tcplog From 031399ae5b50a2115c2a535bcd975a61a12ef575 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 12:18:11 -0700 Subject: [PATCH 10/14] Update [ghstack-poisoned] --- .../workflows/build-image.yaml | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/osdc/integration-tests/workflows/build-image.yaml b/osdc/integration-tests/workflows/build-image.yaml index b523d03d..74956d19 100644 --- a/osdc/integration-tests/workflows/build-image.yaml +++ b/osdc/integration-tests/workflows/build-image.yaml @@ -38,18 +38,28 @@ jobs: - name: Build test image via BuildKit run: | + set -eu echo "=== BuildKit ${{ inputs.arch }} connectivity test ===" ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" echo "Connecting to: $ENDPOINT" - buildctl --addr "$ENDPOINT" build \ - --frontend dockerfile.v0 \ - --local context=docker/test-buildkit \ - --local dockerfile=docker/test-buildkit \ - --output type=image,name=ghcr.io/${{ github.repository }}:integration-test-${{ inputs.arch }}-${{ github.sha }},push=false - - echo "PASS: BuildKit ${{ inputs.arch }} built 
successfully" - echo "Endpoint: $ENDPOINT" + # The buildkit client dials with gRPC's ~20s connect timeout, so a + # busy / cold autoscaled pool drops the connection fast. Retry until a + # pod is free or the pool scales up. + for attempt in $(seq 1 15); do + if buildctl --addr "$ENDPOINT" build \ + --frontend dockerfile.v0 \ + --local context=docker/test-buildkit \ + --local dockerfile=docker/test-buildkit \ + --output type=image,name=ghcr.io/${{ github.repository }}:integration-test-${{ inputs.arch }}-${{ github.sha }},push=false; then + echo "PASS: BuildKit ${{ inputs.arch }} built on attempt ${attempt} ($ENDPOINT)" + exit 0 + fi + echo "attempt ${attempt}/15 failed (BuildKit cold/queued); retrying in 15s..." + sleep 15 + done + echo "FAIL: BuildKit ${{ inputs.arch }} build failed after retries" >&2 + exit 1 - name: Verify BuildKit endpoint info run: | From 3536b0e5a5691ebe89d0159e3f70237e8a6aa8e3 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 12:25:23 -0700 Subject: [PATCH 11/14] Update [ghstack-poisoned] --- .../scripts/python/phases.py | 9 +-- .../scripts/python/test_run.py | 3 +- .../workflows/build-image-scale.yaml | 78 ------------------- .../workflows/build-image.yaml | 65 +++++++++++++++- .../workflows/integration-test.yaml.tpl | 20 +---- 5 files changed, 71 insertions(+), 104 deletions(-) delete mode 100644 osdc/integration-tests/workflows/build-image-scale.yaml diff --git a/osdc/integration-tests/scripts/python/phases.py b/osdc/integration-tests/scripts/python/phases.py index 04d3589d..7a21f204 100644 --- a/osdc/integration-tests/scripts/python/phases.py +++ b/osdc/integration-tests/scripts/python/phases.py @@ -276,11 +276,10 @@ def prepare_pr( # Write integration test workflow (workflows_dir / "integration-test.yaml").write_text(workflow_content) - # Copy reusable BuildKit workflows (connectivity + autoscaling scale test). - # The scale test builds an inline Dockerfile, so it needs no copied context. 
- wf_root = upstream_dir / "integration-tests" / "workflows" - for wf in ("build-image.yaml", "build-image-scale.yaml"): - (workflows_dir / wf).write_text((wf_root / wf).read_text()) + # Copy the reusable BuildKit workflow (connectivity + autoscaling scale jobs). + # The scale job builds an inline Dockerfile, so it needs no copied context. + build_wf_src = upstream_dir / "integration-tests" / "workflows" / "build-image.yaml" + (workflows_dir / "build-image.yaml").write_text(build_wf_src.read_text()) # Copy test Dockerfile (connectivity test context) docker_dir = canary_path / "docker" / "test-buildkit" diff --git a/osdc/integration-tests/scripts/python/test_run.py b/osdc/integration-tests/scripts/python/test_run.py index 5f51e8d2..58e6ab48 100644 --- a/osdc/integration-tests/scripts/python/test_run.py +++ b/osdc/integration-tests/scripts/python/test_run.py @@ -112,9 +112,8 @@ def workflow_template(tmp_path): ) (wf_dir / "integration-test.yaml.tpl").write_text(template) - # Also create reusable workflows and Dockerfile for prepare_pr + # Also create reusable workflow and Dockerfile for prepare_pr (wf_dir / "build-image.yaml").write_text("name: build-image\n") - (wf_dir / "build-image-scale.yaml").write_text("name: build-image-scale\n") docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit" docker_dir.mkdir(parents=True) (docker_dir / "Dockerfile").write_text("FROM alpine\n") diff --git a/osdc/integration-tests/workflows/build-image-scale.yaml b/osdc/integration-tests/workflows/build-image-scale.yaml deleted file mode 100644 index e2618d00..00000000 --- a/osdc/integration-tests/workflows/build-image-scale.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# Reusable workflow: BuildKit autoscaling scale test. 
-# Mirrors the real .ci/docker/docker-builds.yml path — Docker Buildx with the -# remote driver pointed at the per-arch buildkitd endpoint — but builds a -# trivial image that just sleeps, so each of the 8 parallel jobs holds one -# BuildKit slot (server maxconn=1) for ~10 min. The warm baseline is below the -# burst, so the builds finish within timeout-minutes only if KEDA scales the -# pool up; without scale-up they serialize through the baseline pods and the -# back of the queue times out — i.e. this job FAILS when autoscaling does not -# happen. -name: BuildKit Scale Test - -on: - workflow_call: - inputs: - arch: - description: "Target architecture (amd64 or arm64)" - required: true - type: string - runner_label: - description: "Runner label to use (includes cluster prefix)" - required: true - type: string - -jobs: - scale: - # Runs on x86 — BuildKit is a *remote* builder; arch selects the endpoint. - # timeout-minutes is the gate: scaled-up ~18 min, serialized ~43 min. - runs-on: ${{ inputs.runner_label }} - timeout-minutes: 30 - strategy: - fail-fast: false - matrix: - replica: [1, 2, 3, 4, 5, 6, 7, 8] - container: - image: ghcr.io/actions/actions-runner:latest - steps: - - name: Set up Docker Buildx (remote, no bootstrap) - shell: bash - # NOT docker/setup-buildx-action: it runs `buildx inspect --bootstrap`, - # whose ~20s connect timeout fails at setup during a cold scale-up. - # `create` (no --bootstrap) just registers the builder; the build step - # retries to wait out scale-up. 
- run: | - set -ex - docker buildx create \ - --name osdc-remote \ - --driver remote \ - --use \ - "tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" - - - name: Occupy a BuildKit slot (~10 min) to drive autoscaling - shell: bash - run: | - set -eu - cat > Dockerfile.scale <<'EOF' - FROM alpine:3.21 - ARG CACHEBUST - RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600 - EOF - # buildx boots the builder with a hardcoded ~20s connect timeout (gRPC - # MinConnectTimeout), so retry to wait out cold scale-up; the repeated - # attempts also keep KEDA's load signal alive. The 30-min job timeout - # still fails the test if scale-up never happens. - for attempt in $(seq 1 15); do - if docker buildx build \ - --platform "linux/${{ inputs.arch }}" \ - --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \ - --no-cache \ - --output type=cacheonly \ - -f Dockerfile.scale .; then - echo "build succeeded on attempt ${attempt}" - exit 0 - fi - echo "attempt ${attempt}/15 failed (BuildKit cold/queued); retrying in 15s..." - sleep 15 - done - echo "build failed after retries" >&2 - exit 1 diff --git a/osdc/integration-tests/workflows/build-image.yaml b/osdc/integration-tests/workflows/build-image.yaml index 74956d19..04663043 100644 --- a/osdc/integration-tests/workflows/build-image.yaml +++ b/osdc/integration-tests/workflows/build-image.yaml @@ -1,6 +1,7 @@ -# Reusable workflow: Build a test image via OSDC BuildKit -# Called by integration-test.yaml to validate BuildKit connectivity. -# Uses buildctl directly — no Docker daemon required. +# Reusable workflow: exercise OSDC BuildKit for one arch. +# Called by integration-test.yaml. 
Two jobs: +# build — single buildctl build (validates connectivity; buildctl route) +# scale — burst of docker buildx builds (validates autoscaling; prod client) name: Build Test Image on: @@ -66,3 +67,61 @@ jobs: ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" buildctl --addr "$ENDPOINT" debug info || echo "WARN: debug info not available" echo "PASS: BuildKit ${{ inputs.arch }} endpoint is responsive" + + scale: + # 8 parallel docker buildx builds (the prod client), each holding a BuildKit + # slot (server maxconn=1) ~10 min via a sleep. The warm baseline is below the + # burst, so they finish within timeout-minutes only if KEDA scales the pool + # up; otherwise the back of the burst serializes and the job times out — i.e. + # this FAILS if autoscaling does not happen. + runs-on: ${{ inputs.runner_label }} + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + replica: [1, 2, 3, 4, 5, 6, 7, 8] + container: + image: ghcr.io/actions/actions-runner:latest + steps: + - name: Set up Docker Buildx (remote, no bootstrap) + shell: bash + # NOT docker/setup-buildx-action: it runs `buildx inspect --bootstrap`, + # whose ~20s connect timeout fails at setup during a cold scale-up. + # `create` (no --bootstrap) just registers the builder; the build step + # retries to wait out scale-up. + run: | + set -ex + docker buildx create \ + --name osdc-remote \ + --driver remote \ + --use \ + "tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" + + - name: Occupy a BuildKit slot (~10 min) to drive autoscaling + shell: bash + run: | + set -eu + cat > Dockerfile.scale <<'EOF' + FROM alpine:3.21 + ARG CACHEBUST + RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600 + EOF + # buildx boots the builder with a hardcoded ~20s connect timeout (gRPC + # MinConnectTimeout), so retry to wait out cold scale-up; the repeated + # attempts also keep KEDA's load signal alive. The 30-min job timeout + # still fails the test if scale-up never happens. 
+ for attempt in $(seq 1 15); do + if docker buildx build \ + --platform "linux/${{ inputs.arch }}" \ + --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \ + --no-cache \ + --output type=cacheonly \ + -f Dockerfile.scale .; then + echo "build succeeded on attempt ${attempt}" + exit 0 + fi + echo "attempt ${attempt}/15 failed (BuildKit cold/queued); retrying in 15s..." + sleep 15 + done + echo "build failed after retries" >&2 + exit 1 diff --git a/osdc/integration-tests/workflows/integration-test.yaml.tpl b/osdc/integration-tests/workflows/integration-test.yaml.tpl index 7d4fb3aa..bb453f0d 100644 --- a/osdc/integration-tests/workflows/integration-test.yaml.tpl +++ b/osdc/integration-tests/workflows/integration-test.yaml.tpl @@ -1502,32 +1502,20 @@ jobs: # END_B200 # ── BuildKit Tests ──────────────────────────────────────────────────── - build-amd64: + # Each call runs a buildctl connectivity build + an 8-wide docker buildx burst + # (fails if KEDA does not scale the pool up). + buildkit-amd64: uses: ./.github/workflows/build-image.yaml with: arch: amd64 runner_label: {{PREFIX}}l-x86iamx-8-32 - build-arm64: + buildkit-arm64: uses: ./.github/workflows/build-image.yaml with: arch: arm64 runner_label: {{PREFIX}}l-x86iamx-8-32 - # ── BuildKit Autoscaling Scale Test ─────────────────────────────────── - # Bursts 8 parallel builds per arch; fails if KEDA does not scale the pool up. 
- buildkit-scale-amd64: - uses: ./.github/workflows/build-image-scale.yaml - with: - arch: amd64 - runner_label: {{PREFIX}}l-x86iamx-8-32 - - buildkit-scale-arm64: - uses: ./.github/workflows/build-image-scale.yaml - with: - arch: arm64 - runner_label: {{PREFIX}}l-x86iamx-8-32 - # ── Harbor Cache Test ───────────────────────────────────────────────── test-harbor: runs-on: {{PREFIX}}l-x86iamx-8-32 From 94148dc5838b6401060046cf70be73fd3beeb8a4 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 13:00:53 -0700 Subject: [PATCH 12/14] Update [ghstack-poisoned] --- .../workflows/build-image.yaml | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/osdc/integration-tests/workflows/build-image.yaml b/osdc/integration-tests/workflows/build-image.yaml index 04663043..df36cfc8 100644 --- a/osdc/integration-tests/workflows/build-image.yaml +++ b/osdc/integration-tests/workflows/build-image.yaml @@ -44,10 +44,11 @@ jobs: ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" echo "Connecting to: $ENDPOINT" - # The buildkit client dials with gRPC's ~20s connect timeout, so a - # busy / cold autoscaled pool drops the connection fast. Retry until a - # pod is free or the pool scales up. - for attempt in $(seq 1 15); do + # The buildkit client dials with gRPC's ~20s connect timeout, so a busy + # / cold pool drops the connection fast (no HAProxy queue holds it). + # Retry long enough to outlast a peer's ~10 min build when the pool is + # over-subscribed (9 builds > 8 pods): ~45 x (≈5s fail + 15s) ≈ 15 min. + for attempt in $(seq 1 45); do if buildctl --addr "$ENDPOINT" build \ --frontend dockerfile.v0 \ --local context=docker/test-buildkit \ @@ -56,7 +57,7 @@ jobs: echo "PASS: BuildKit ${{ inputs.arch }} built on attempt ${attempt} ($ENDPOINT)" exit 0 fi - echo "attempt ${attempt}/15 failed (BuildKit cold/queued); retrying in 15s..." + echo "attempt ${attempt}/45 failed (BuildKit cold/queued); retrying in 15s..." 
sleep 15 done echo "FAIL: BuildKit ${{ inputs.arch }} build failed after retries" >&2 @@ -74,6 +75,10 @@ jobs: # burst, so they finish within timeout-minutes only if KEDA scales the pool # up; otherwise the back of the burst serializes and the job times out — i.e. # this FAILS if autoscaling does not happen. + # + # Runs concurrently with `build`, so 9 builds contend for a max-8 pool: the + # odd one out has no pod until a peer's ~10 min build finishes, exercising + # the over-subscription wait (the retry below must outlast that). runs-on: ${{ inputs.runner_label }} timeout-minutes: 30 strategy: @@ -107,10 +112,11 @@ jobs: RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600 EOF # buildx boots the builder with a hardcoded ~20s connect timeout (gRPC - # MinConnectTimeout), so retry to wait out cold scale-up; the repeated - # attempts also keep KEDA's load signal alive. The 30-min job timeout - # still fails the test if scale-up never happens. - for attempt in $(seq 1 15); do + # MinConnectTimeout), so retry to wait out cold scale-up and, when the + # pool is over-subscribed, a peer's ~10 min build; the repeated attempts + # also keep KEDA's load signal alive. ~45 x (≈5s fail + 15s) ≈ 15 min, + # within the 30-min job timeout (still fails if scale-up never happens). + for attempt in $(seq 1 45); do if docker buildx build \ --platform "linux/${{ inputs.arch }}" \ --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \ @@ -120,7 +126,7 @@ jobs: echo "build succeeded on attempt ${attempt}" exit 0 fi - echo "attempt ${attempt}/15 failed (BuildKit cold/queued); retrying in 15s..." + echo "attempt ${attempt}/45 failed (BuildKit cold/queued); retrying in 15s..." 
sleep 15 done echo "build failed after retries" >&2 From 5b75928745d90964ff2efb2c8034c1fc26f2de19 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 16:23:14 -0700 Subject: [PATCH 13/14] Update (base update) [ghstack-poisoned] --- osdc/modules/buildkit/README.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/osdc/modules/buildkit/README.md b/osdc/modules/buildkit/README.md index 6e4e30eb..b38c1511 100644 --- a/osdc/modules/buildkit/README.md +++ b/osdc/modules/buildkit/README.md @@ -4,8 +4,9 @@ Remote BuildKit build service: per-arch `buildkitd` Deployments behind an HAProx LB, on dedicated Karpenter NodePools. Clients build with `buildctl --addr tcp://buildkitd-<arch>.buildkit:1234`. -Sizing is per-arch in `clusters.yaml` (`buildkit.{amd64,arm64}_{replicas,pods_per_node}`, -`*_instance_type`); pod CPU/mem is computed by `scripts/python/generate_buildkit.py`. +Sizing is per-arch in `clusters.yaml` (`buildkit.{amd64,arm64}_*` instance type, +pods-per-node, and autoscaling `*_min` / `*_max`); pod CPU/mem is computed by +`scripts/python/generate_buildkit.py`. ## Autoscaling (optional, `buildkit.autoscaling.enabled`) @@ -18,10 +19,15 @@ back to a small warm baseline when idle. until a pod frees or the pool scales up. - **In-cluster scale signal** — KEDA `ScaledObject` per arch, `metrics-api` scraping the LB's own metrics (`haproxy_backend_current_sessions`) — no external - metrics backend. + metrics backend. If KEDA can't read the metric, a `fallback` (`*_fallback`, + e.g. 32/8 on prod) holds the proven fixed pool instead of freezing the count. - **Warm baseline** — `amd64_min` / `arm64_min` keep ≥1 node per arch up so the common case gets a free warm pod immediately. `*_max` caps the burst; NodePool limits are sized to `*_max`.
+- **No-flap scale-down** — KEDA holds a pod ~20 min after it goes idle + (`stabilizationWindowSeconds: 1200`), then sheds at most `max(10 pods, 20%)` + per 20 min, so a follow-up build reuses the pod's warm decompressed NVMe layer + cache. Node churn is left to Karpenter. - **Safe scale-down** — `preStop` drain (waits until the pod's `:1234` is idle) + long `terminationGracePeriodSeconds` + PDB, so a build is never killed mid-flight. Scale-down removes an arbitrary pod, which may be mid-build; the @@ -67,4 +73,6 @@ new config. An unchanged config keeps the same hash, so routine deploys don't churn the LB. (The buildkitd worker pods do **not** yet have this, so a `buildkitd.toml` / `drain.sh` change needs a manual rollout to take effect.) -Requires the `keda` module deployed before `buildkit` (provides the CRDs). +Requires the `keda` module deployed before `buildkit` (provides the CRDs). The +`monitoring` module scrapes the KEDA operator's metrics and ships +buildkit-autoscaling alerts (scaler / fallback errors, queue backlog). From 4a0f40c84e2f84cd303aa67d0be7ad809b6c8682 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 11 Jun 2026 09:19:55 -0700 Subject: [PATCH 14/14] Update (base update) [ghstack-poisoned] --- osdc/clusters.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml index bd618668..05f2ea4d 100644 --- a/osdc/clusters.yaml +++ b/osdc/clusters.yaml @@ -299,9 +299,9 @@ clusters: arm64_pods_per_node: 4 autoscaling: enabled: true - amd64_min: 2 # 1x m6id.24xlarge (2 pods/node) + amd64_min: 32 # warm baseline = proven fixed pool (16x m6id.24xlarge) amd64_max: 360 # ~90d peak ≈180, x2 for headroom - arm64_min: 4 # 1x m7gd.16xlarge (4 pods/node) + arm64_min: 8 # warm baseline = proven fixed pool (2x m7gd.16xlarge) arm64_max: 30 # ~90d peak ≈15, x2 for headroom amd64_fallback: 32 # if KEDA can't read metrics, hold the proven fixed pool arm64_fallback: 8