diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml index 9bc0dc34..214ac531 100644 --- a/osdc/clusters.yaml +++ b/osdc/clusters.yaml @@ -269,11 +269,17 @@ clusters: min_node_age_seconds: 900 buildkit: amd64_instance_type: m6id.24xlarge - amd64_replicas: 32 amd64_pods_per_node: 2 arm64_instance_type: m7gd.16xlarge - arm64_replicas: 8 arm64_pods_per_node: 4 + autoscaling: + enabled: true + amd64_min: 32 # warm baseline = proven fixed pool (16x m6id.24xlarge) + amd64_max: 360 # ~90d peak ≈180, x2 for headroom + arm64_min: 8 # warm baseline = proven fixed pool (2x m7gd.16xlarge) + arm64_max: 30 # ~90d peak ≈15, x2 for headroom + amd64_fallback: 32 # if KEDA can't read metrics, hold the proven fixed pool + arm64_fallback: 8 # as for amd64: hold the proven fixed pool (= arm64_min) arc-runners: github_config_url: "https://github.com/pytorch" github_secret_name: pytorch-arc-cbr-production @@ -299,6 +305,7 @@ clusters: - arc-runners - arc-runners-b200 - arc-runners-h100 + - keda - buildkit - pypi-cache - cache-enforcer diff --git a/osdc/integration-tests/scripts/python/phases.py b/osdc/integration-tests/scripts/python/phases.py index 27ddebf6..7a21f204 100644 --- a/osdc/integration-tests/scripts/python/phases.py +++ b/osdc/integration-tests/scripts/python/phases.py @@ -276,11 +276,12 @@ def prepare_pr( # Write integration test workflow (workflows_dir / "integration-test.yaml").write_text(workflow_content) - # Copy build-image reusable workflow + # Copy the reusable BuildKit workflow (connectivity + autoscaling scale jobs). + # The scale job builds an inline Dockerfile, so it needs no copied context. 
build_wf_src = upstream_dir / "integration-tests" / "workflows" / "build-image.yaml" (workflows_dir / "build-image.yaml").write_text(build_wf_src.read_text()) - # Copy test Dockerfile + # Copy test Dockerfile (connectivity test context) docker_dir = canary_path / "docker" / "test-buildkit" docker_dir.mkdir(parents=True, exist_ok=True) dockerfile_src = upstream_dir / "integration-tests" / "docker" / "test-buildkit" / "Dockerfile" diff --git a/osdc/integration-tests/scripts/python/test_run.py b/osdc/integration-tests/scripts/python/test_run.py index ff3a7bcb..58e6ab48 100644 --- a/osdc/integration-tests/scripts/python/test_run.py +++ b/osdc/integration-tests/scripts/python/test_run.py @@ -112,7 +112,7 @@ def workflow_template(tmp_path): ) (wf_dir / "integration-test.yaml.tpl").write_text(template) - # Also create build-image.yaml and Dockerfile for prepare_pr + # Also create the reusable build-image.yaml workflow and Dockerfile for prepare_pr (wf_dir / "build-image.yaml").write_text("name: build-image\n") docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit" docker_dir.mkdir(parents=True) diff --git a/osdc/integration-tests/workflows/build-image.yaml b/osdc/integration-tests/workflows/build-image.yaml index b523d03d..df36cfc8 100644 --- a/osdc/integration-tests/workflows/build-image.yaml +++ b/osdc/integration-tests/workflows/build-image.yaml @@ -1,6 +1,7 @@ -# Reusable workflow: Build a test image via OSDC BuildKit -# Called by integration-test.yaml to validate BuildKit connectivity. -# Uses buildctl directly — no Docker daemon required. +# Reusable workflow: exercise OSDC BuildKit for one arch. +# Called by integration-test.yaml. 
Two jobs: +# build — single buildctl build (validates connectivity; buildctl route) +# scale — burst of docker buildx builds (validates autoscaling; prod client) name: Build Test Image on: @@ -38,21 +39,95 @@ jobs: - name: Build test image via BuildKit run: | + set -eu echo "=== BuildKit ${{ inputs.arch }} connectivity test ===" ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" echo "Connecting to: $ENDPOINT" - buildctl --addr "$ENDPOINT" build \ - --frontend dockerfile.v0 \ - --local context=docker/test-buildkit \ - --local dockerfile=docker/test-buildkit \ - --output type=image,name=ghcr.io/${{ github.repository }}:integration-test-${{ inputs.arch }}-${{ github.sha }},push=false - - echo "PASS: BuildKit ${{ inputs.arch }} built successfully" - echo "Endpoint: $ENDPOINT" + # The buildkit client dials with gRPC's ~20s connect timeout, so a busy + # / cold pool drops the connection fast (no HAProxy queue holds it). + # Retry long enough to outlast a peer's ~10 min build when the pool is + # over-subscribed (9 builds > 8 pods): 45 attempts x (≈5s fail + 15s sleep) ≈ 15 min. + for attempt in $(seq 1 45); do + if buildctl --addr "$ENDPOINT" build \ + --frontend dockerfile.v0 \ + --local context=docker/test-buildkit \ + --local dockerfile=docker/test-buildkit \ + --output type=image,name=ghcr.io/${{ github.repository }}:integration-test-${{ inputs.arch }}-${{ github.sha }},push=false; then + echo "PASS: BuildKit ${{ inputs.arch }} built on attempt ${attempt} ($ENDPOINT)" + exit 0 + fi + echo "attempt ${attempt}/45 failed (BuildKit cold/queued); retrying in 15s..." 
+ sleep 15 + done + echo "FAIL: BuildKit ${{ inputs.arch }} build failed after retries" >&2 + exit 1 - name: Verify BuildKit endpoint info run: | ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" buildctl --addr "$ENDPOINT" debug info || echo "WARN: debug info not available" echo "PASS: BuildKit ${{ inputs.arch }} endpoint is responsive" + + scale: + # 8 parallel docker buildx builds (the prod client), each holding a BuildKit + # slot (server maxconn=1) ~10 min via a sleep. The warm baseline is below the + # burst, so they finish within timeout-minutes only if KEDA scales the pool + # up; otherwise the back of the burst serializes and the job times out — i.e. + # this FAILS if autoscaling does not happen. + # + # Runs concurrently with `build`, so 9 builds contend for a max-8 pool: the + # odd one out has no pod until a peer's ~10 min build finishes, exercising + # the over-subscription wait (the retry below must outlast that). + runs-on: ${{ inputs.runner_label }} + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + replica: [1, 2, 3, 4, 5, 6, 7, 8] + container: + image: ghcr.io/actions/actions-runner:latest + steps: + - name: Set up Docker Buildx (remote, no bootstrap) + shell: bash + # NOT docker/setup-buildx-action: it runs `buildx inspect --bootstrap`, + # whose ~20s connect timeout fails at setup during a cold scale-up. + # `create` (no --bootstrap) just registers the builder; the build step + # retries to wait out scale-up. 
+ run: | + set -ex + docker buildx create \ + --name osdc-remote \ + --driver remote \ + --use \ + "tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" + + - name: Occupy a BuildKit slot (~10 min) to drive autoscaling + shell: bash + run: | + set -eu + cat > Dockerfile.scale <<'EOF' + FROM alpine:3.21 + ARG CACHEBUST + RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600 + EOF + # buildx boots the builder with a hardcoded ~20s connect timeout (gRPC + # MinConnectTimeout), so retry to wait out cold scale-up and, when the + # pool is over-subscribed, a peer's ~10 min build; the repeated attempts + # also keep KEDA's load signal alive. 45 attempts x (≈5s fail + 15s sleep) ≈ 15 min; with the ~10 min build that is ≈25 min, + # within the 30-min job timeout (still fails if scale-up never happens). + for attempt in $(seq 1 45); do + if docker buildx build \ + --platform "linux/${{ inputs.arch }}" \ + --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \ + --no-cache \ + --output type=cacheonly \ + -f Dockerfile.scale .; then + echo "build succeeded on attempt ${attempt}" + exit 0 + fi + echo "attempt ${attempt}/45 failed (BuildKit cold/queued); retrying in 15s..." + sleep 15 + done + echo "build failed after retries" >&2 + exit 1 diff --git a/osdc/integration-tests/workflows/integration-test.yaml.tpl b/osdc/integration-tests/workflows/integration-test.yaml.tpl index 4c20c327..63b60740 100644 --- a/osdc/integration-tests/workflows/integration-test.yaml.tpl +++ b/osdc/integration-tests/workflows/integration-test.yaml.tpl @@ -1436,13 +1436,15 @@ jobs: # END_B200 # ── BuildKit Tests ──────────────────────────────────────────────────── - build-amd64: + # Each call runs a buildctl connectivity build + an 8-wide docker buildx burst + # (fails if KEDA does not scale the pool up). + buildkit-amd64: uses: ./.github/workflows/build-image.yaml with: arch: amd64 runner_label: {{PREFIX}}l-x86iamx-8-32 - build-arm64: + buildkit-arm64: uses: ./.github/workflows/build-image.yaml with: arch: arm64