Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions osdc/clusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ defaults:
cpu_disruption_budget: "20%"
buildkit:
replicas_per_arch: 12
autoscaling:
enabled: false
keda:
chart_version: "2.16.1"
monitoring:
grafana_cloud_url: "https://prometheus-prod-36-prod-us-west-0.grafana.net/api/prom/push"
grafana_cloud_read_url: "https://prometheus-prod-36-prod-us-west-0.grafana.net/api/prom"
Expand Down Expand Up @@ -162,15 +166,22 @@ clusters:
github_secret_name: pytorch-arc-staging
runner_name_prefix: "c-mt-"
buildkit:
replicas_per_arch: 2
arm64_instance_type: m7gd.16xlarge
arm64_pods_per_node: 4
autoscaling:
enabled: true
amd64_min: 2 # 1x m6id.24xlarge (2 pods/node)
amd64_max: 8
arm64_min: 4 # 1x m7gd.16xlarge (4 pods/node)
arm64_max: 8
pypi_cache:
replicas: 1
modules:
- karpenter
- arc
- nodepools
- arc-runners
- keda
- buildkit
- pypi-cache
- cache-enforcer
Expand Down Expand Up @@ -283,11 +294,15 @@ clusters:
min_node_age_seconds: 900
buildkit:
amd64_instance_type: m6id.24xlarge
amd64_replicas: 32
amd64_pods_per_node: 2
arm64_instance_type: m7gd.16xlarge
arm64_replicas: 8
arm64_pods_per_node: 4
autoscaling:
enabled: true
amd64_min: 2 # 1x m6id.24xlarge (2 pods/node)
amd64_max: 128 # 14d peak 105; headroom to spare above it
arm64_min: 4 # 1x m7gd.16xlarge (4 pods/node)
arm64_max: 16 # 14d peak 8, likely capped by fixed pool; headroom
arc-runners:
github_config_url: "https://github.com/pytorch"
github_secret_name: pytorch-arc-cbr-production
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Scale-test image. Every build parks on one BuildKit slot for 600 s (~10 min),
# so a burst of parallel builds outgrows the warm baseline and KEDA has to scale
# the pool up. Each build passes a distinct CACHEBUST value, which prevents
# layer reuse and makes every build actually execute.
FROM alpine:3.21
ARG CACHEBUST
RUN echo "osdc buildkit scale-test ${CACHEBUST}" \
    && sleep 600
20 changes: 11 additions & 9 deletions osdc/integration-tests/scripts/python/phases.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,15 +276,17 @@ def prepare_pr(
# Write integration test workflow
(workflows_dir / "integration-test.yaml").write_text(workflow_content)

# Copy build-image reusable workflow
build_wf_src = upstream_dir / "integration-tests" / "workflows" / "build-image.yaml"
(workflows_dir / "build-image.yaml").write_text(build_wf_src.read_text())

# Copy test Dockerfile
docker_dir = canary_path / "docker" / "test-buildkit"
docker_dir.mkdir(parents=True, exist_ok=True)
dockerfile_src = upstream_dir / "integration-tests" / "docker" / "test-buildkit" / "Dockerfile"
(docker_dir / "Dockerfile").write_text(dockerfile_src.read_text())
# Copy reusable BuildKit workflows
wf_root = upstream_dir / "integration-tests" / "workflows"
for wf in ("build-image.yaml", "build-image-scale.yaml"):
(workflows_dir / wf).write_text((wf_root / wf).read_text())

# Copy test Dockerfiles (connectivity + autoscaling scale test)
docker_root = upstream_dir / "integration-tests" / "docker"
for name in ("test-buildkit", "test-buildkit-scale"):
dst = canary_path / "docker" / name
dst.mkdir(parents=True, exist_ok=True)
(dst / "Dockerfile").write_text((docker_root / name / "Dockerfile").read_text())

# Commit
run_cmd(["git", "add", "-A"], cwd=canary_path)
Expand Down
10 changes: 6 additions & 4 deletions osdc/integration-tests/scripts/python/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,13 @@ def workflow_template(tmp_path):
)
(wf_dir / "integration-test.yaml.tpl").write_text(template)

# Also create build-image.yaml and Dockerfile for prepare_pr
# Also create reusable workflows and Dockerfiles for prepare_pr
(wf_dir / "build-image.yaml").write_text("name: build-image\n")
docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit"
docker_dir.mkdir(parents=True)
(docker_dir / "Dockerfile").write_text("FROM alpine\n")
(wf_dir / "build-image-scale.yaml").write_text("name: build-image-scale\n")
for name in ("test-buildkit", "test-buildkit-scale"):
docker_dir = upstream / "integration-tests" / "docker" / name
docker_dir.mkdir(parents=True)
(docker_dir / "Dockerfile").write_text("FROM alpine\n")

return upstream

Expand Down
59 changes: 59 additions & 0 deletions osdc/integration-tests/workflows/build-image-scale.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Reusable workflow: BuildKit autoscaling scale test.
#
# Launches a burst of 8 parallel builds against one arch's remote BuildKit. Each
# build holds a slot (HAProxy `server maxconn 1`) for ~10 min via a sleep in the
# test Dockerfile. When the warm baseline (amd64_min / arm64_min) is well below
# the burst, the builds finish within timeout-minutes only if KEDA scales the
# pool up; without scale-up they serialize through the baseline pods and the
# back of the queue times out, so the job FAILS when autoscaling does not happen.
#
# NOTE(review): the gate depends on the baseline size. With a 2-pod baseline the
# 8 builds serialize in four ~10 min rounds (~43 min) and exceed the timeout.
# With a 4-pod baseline (arm64_min: 4) they serialize in only two rounds
# (~20 min plus setup), which may fit inside the 30 min timeout even when no
# scale-up happens. Confirm the arm64 run really gates autoscaling; a larger
# burst or a longer hold would make it decisive.
name: BuildKit Scale Test

on:
  workflow_call:
    inputs:
      arch:
        description: "Target architecture (amd64 or arm64)"
        required: true
        type: string
      runner_label:
        description: "Runner label to use (includes cluster prefix)"
        required: false
        type: string
        default: "l-x86iavx512-2-4"

jobs:
  scale:
    # Runs on x86: BuildKit is a *remote* builder; arch only selects the endpoint.
    # timeout-minutes is the gate: scaled-up ~18 min, serialized ~43 min (2-pod
    # baseline; see the NOTE(review) in the file header for larger baselines).
    runs-on: ${{ inputs.runner_label }}
    timeout-minutes: 30
    strategy:
      # Let every build run to completion (or timeout) so all 8 results are visible.
      fail-fast: false
      matrix:
        replica: [1, 2, 3, 4, 5, 6, 7, 8]
    container:
      image: ghcr.io/actions/actions-runner:latest
    steps:
      - name: Install buildctl
        run: |
          BUILDKIT_VERSION="v0.29.0"
          mkdir -p "$HOME/.local/bin"
          # --fail: abort on an HTTP error instead of piping an error page to tar.
          curl -fsSL "https://github.com/moby/buildkit/releases/download/${BUILDKIT_VERSION}/buildkit-${BUILDKIT_VERSION}.linux-amd64.tar.gz" \
            | tar xz --strip-components=1 -C "$HOME/.local/bin" bin/buildctl
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
          "$HOME/.local/bin/buildctl" --version

      - name: Checkout
        uses: actions/checkout@v4

      - name: Occupy a BuildKit slot (~10 min) to drive autoscaling
        # Pass expression values through the environment rather than splicing
        # them into the script text, so they are always treated as data.
        env:
          ARCH: ${{ inputs.arch }}
          REPLICA: ${{ matrix.replica }}
          RUN_ID: ${{ github.run_id }}
        run: |
          ENDPOINT="tcp://buildkitd-${ARCH}.buildkit:1234"
          echo "Build ${REPLICA} -> $ENDPOINT"
          # CACHEBUST is unique per arch/replica/run so no two builds share layers.
          buildctl --addr "$ENDPOINT" build \
            --frontend dockerfile.v0 \
            --local context=docker/test-buildkit-scale \
            --local dockerfile=docker/test-buildkit-scale \
            --opt "build-arg:CACHEBUST=${ARCH}-${REPLICA}-${RUN_ID}" \
            --no-cache \
            --output type=cacheonly
          echo "PASS: build ${REPLICA} finished within timeout"
14 changes: 14 additions & 0 deletions osdc/integration-tests/workflows/integration-test.yaml.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -1514,6 +1514,20 @@ jobs:
arch: arm64
runner_label: {{PREFIX}}l-x86iamx-8-32

# ── BuildKit Autoscaling Scale Test ───────────────────────────────────
# Bursts 8 parallel builds per arch; fails if KEDA does not scale the pool up.
buildkit-scale-amd64:
uses: ./.github/workflows/build-image-scale.yaml
with:
arch: amd64
runner_label: {{PREFIX}}l-x86iamx-8-32

buildkit-scale-arm64:
uses: ./.github/workflows/build-image-scale.yaml
with:
arch: arm64
runner_label: {{PREFIX}}l-x86iamx-8-32

# ── Harbor Cache Test ─────────────────────────────────────────────────
test-harbor:
runs-on: {{PREFIX}}l-x86iamx-8-32
Expand Down
40 changes: 40 additions & 0 deletions osdc/modules/buildkit/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# BuildKit module

Remote BuildKit build service: per-arch `buildkitd` Deployments behind an HAProxy
LB, on dedicated Karpenter NodePools. Clients build with
`buildctl --addr tcp://buildkitd-<arch>.buildkit:1234`.

Sizing is per-arch in `clusters.yaml` (`buildkit.{amd64,arm64}_{replicas,pods_per_node}`,
`*_instance_type`); pod CPU/mem is computed by `scripts/python/generate_buildkit.py`.

## Autoscaling (optional, `buildkit.autoscaling.enabled`)

Absorb bursts of concurrent builds without overloading existing pods, and scale
back to a small warm baseline when idle.

- **One build per pod** — HAProxy `server maxconn 1` (matches buildkitd
`max-parallelism = 1`). Excess builds **queue** in HAProxy instead of stacking
on a busy pod; as new pods register (DNS), queued builds flow onto them, so
scaled-up pods never sit idle. `timeout queue` must stay set and large enough
to outlast a node-provision cycle — if omitted it falls back to `timeout
connect` (5s), which would abort queued builds before pods scale up.
- **In-cluster scale signal** — KEDA `ScaledObject` per arch, `metrics-api`
scraping the LB's own metrics (`haproxy_backend_current_sessions`) — no external
metrics backend.
- **Warm baseline** — `amd64_min` / `arm64_min` keep ≥1 node per arch up so the
common case gets a free warm pod immediately. `*_max` caps the burst; NodePool
limits are sized to `*_max`.
- **Safe scale-down** — `preStop` drain (waits until the pod's `:1234` is idle)
+ long `terminationGracePeriodSeconds` + PDB, so a build is never killed
mid-flight.
- **Node consolidation** — the NodePool uses Karpenter `consolidationPolicy:
WhenEmpty`: a node is reclaimed only once it has no buildkitd pod, never by
evicting a running build to bin-pack. So after a burst, scattered survivor
pods can leave nodes half-full (more nodes than the cold baseline) until they
drain naturally — a deliberate trade of some idle node cost for zero build
disruption.

Build clients should retry the initial connection, so that a build waits for a
pod to become available when the pool is cold or builds are already queued.

Requires the `keda` module deployed before `buildkit` (provides the CRDs).
42 changes: 33 additions & 9 deletions osdc/modules/buildkit/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,22 +34,38 @@ AMD64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_replicas "$REPLICAS")
ARM64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_replicas "$REPLICAS")
AMD64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_pods_per_node "$PODS_PER_NODE")
ARM64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_pods_per_node "$PODS_PER_NODE")
AUTOSCALING=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.enabled false | tr '[:upper:]' '[:lower:]')

GENERATED_DIR="$MODULE_DIR/generated"

# --- Generate manifests ---

echo "Generating BuildKit manifests..."
uv run "$MODULE_DIR/scripts/python/generate_buildkit.py" \
--arm64-instance-type "$ARM64_INSTANCE" \
--amd64-instance-type "$AMD64_INSTANCE" \
--replicas "$REPLICAS" \
--pods-per-node "$PODS_PER_NODE" \
--amd64-replicas "$AMD64_REPLICAS" \
--arm64-replicas "$ARM64_REPLICAS" \
--amd64-pods-per-node "$AMD64_PODS_PER_NODE" \
--arm64-pods-per-node "$ARM64_PODS_PER_NODE" \
GEN_ARGS=(
--arm64-instance-type "$ARM64_INSTANCE"
--amd64-instance-type "$AMD64_INSTANCE"
--replicas "$REPLICAS"
--pods-per-node "$PODS_PER_NODE"
--amd64-replicas "$AMD64_REPLICAS"
--arm64-replicas "$ARM64_REPLICAS"
--amd64-pods-per-node "$AMD64_PODS_PER_NODE"
--arm64-pods-per-node "$ARM64_PODS_PER_NODE"
--output-dir "$GENERATED_DIR"
)
if [[ "$AUTOSCALING" == "true" ]]; then
AMD64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_min 2)
AMD64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_max 8)
ARM64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_min 4)
ARM64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_max 8)
GEN_ARGS+=(
--autoscaling
--amd64-min "$AMD64_MIN"
--amd64-max "$AMD64_MAX"
--arm64-min "$ARM64_MIN"
--arm64-max "$ARM64_MAX"
)
fi
uv run "$MODULE_DIR/scripts/python/generate_buildkit.py" "${GEN_ARGS[@]}"

# --- Apply NodePools (with cluster name substitution) ---

Expand Down Expand Up @@ -93,5 +109,13 @@ else
kubectl rollout status deployment/buildkitd-amd64 -n buildkit --timeout=15m
fi

# --- KEDA autoscaling (optional) ---
# Scales on the in-cluster buildkit LB metrics; no external metrics backend.

if [[ "$AUTOSCALING" == "true" ]]; then
echo "Applying KEDA autoscaling manifests..."
kubectl_apply_if_changed -f "$GENERATED_DIR/autoscaling.yaml"
fi

echo "BuildKit deployed."
kubectl get pods -n buildkit -o wide
21 changes: 21 additions & 0 deletions osdc/modules/buildkit/kubernetes/base/drain-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: buildkitd-drain
  namespace: buildkit
data:
  # preStop drain: block termination until no in-flight build remains. A build
  # keeps an ESTABLISHED inbound connection on :1234 for its whole duration;
  # require two consecutive idle polls (15 s apart) so a transient health check
  # can't be mistaken for "done". terminationGracePeriodSeconds caps the total
  # wait.
  #
  # Fail-safe: if netstat cannot be run, the connection state is unknown, so the
  # pod is treated as busy rather than idle. Otherwise a missing or broken
  # netstat would silently end the drain after two polls and kill a running
  # build.
  drain.sh: |
    #!/bin/sh
    # Count of consecutive polls that saw no build connection on :1234.
    idle=0
    while :; do
      if conns=$(netstat -tn 2>/dev/null); then
        # $4 is the local address, $NF the TCP state.
        if printf '%s\n' "$conns" | awk '$NF=="ESTABLISHED" && $4 ~ /:1234$/{f=1} END{exit !f}'; then
          idle=0
        else
          idle=$((idle + 1))
        fi
      else
        echo "drain: netstat failed; assuming a build is still in flight" >&2
        idle=0
      fi
      # Exit as soon as the second idle poll confirms the drain, without
      # sleeping one more interval first.
      if [ "$idle" -ge 2 ]; then
        break
      fi
      sleep 15
    done
9 changes: 6 additions & 3 deletions osdc/modules/buildkit/kubernetes/base/haproxy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ data:
timeout connect 5s
timeout client 120m
timeout server 120m
# Queue builds (server maxconn=1) while pods scale up. Keep <= 120m, the max
# time a docker build is allowed to run.
timeout queue 60m
log global
option tcplog

Expand Down Expand Up @@ -71,15 +74,15 @@ data:
# resolution warnings.
backend bk_arm64
balance leastconn
server-template arm 10 buildkitd-arm64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
server-template arm 10 buildkitd-arm64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1

backend bk_amd64
balance leastconn
server-template amd 10 buildkitd-amd64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
server-template amd 10 buildkitd-amd64-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1

backend bk_all
balance leastconn
server-template all 20 buildkitd-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6
server-template all 20 buildkitd-pods.buildkit.svc.cluster.local:1234 check resolvers k8s init-addr none resolve-prefer ipv6 maxconn 1

---
apiVersion: apps/v1
Expand Down
2 changes: 2 additions & 0 deletions osdc/modules/buildkit/kubernetes/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ kind: Kustomization
resources:
- namespace.yaml
- configmap.yaml
- drain-configmap.yaml
- haproxy.yaml
- service.yaml
- networkpolicy.yaml
- poddisruptionbudget.yaml
25 changes: 25 additions & 0 deletions osdc/modules/buildkit/kubernetes/base/poddisruptionbudget.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
# Voluntary disruptions (node consolidation, drains) may take down at most one
# builder per architecture at a time, so that each eviction goes through the
# preStop drain instead of killing builds.

# arm64 builders
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  namespace: buildkit
  name: buildkitd-arm64
spec:
  selector:
    matchLabels:
      arch: arm64
      app: buildkitd
  maxUnavailable: 1
---
# amd64 builders
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  namespace: buildkit
  name: buildkitd-amd64
spec:
  selector:
    matchLabels:
      arch: amd64
      app: buildkitd
  maxUnavailable: 1
Loading
Loading