diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml index 9bc0dc34..214ac531 100644 --- a/osdc/clusters.yaml +++ b/osdc/clusters.yaml @@ -269,11 +269,17 @@ clusters: min_node_age_seconds: 900 buildkit: amd64_instance_type: m6id.24xlarge - amd64_replicas: 32 amd64_pods_per_node: 2 arm64_instance_type: m7gd.16xlarge - arm64_replicas: 8 arm64_pods_per_node: 4 + autoscaling: + enabled: true + amd64_min: 32 # warm baseline = proven fixed pool (16x m6id.24xlarge) + amd64_max: 360 # ~90d peak ≈180, x2 for headroom + arm64_min: 8 # warm baseline = proven fixed pool (2x m7gd.16xlarge) + arm64_max: 30 # ~90d peak ≈15, x2 for headroom + amd64_fallback: 32 # if KEDA can't read metrics, hold the proven fixed pool + arm64_fallback: 8 arc-runners: github_config_url: "https://github.com/pytorch" github_secret_name: pytorch-arc-cbr-production @@ -299,6 +305,7 @@ clusters: - arc-runners - arc-runners-b200 - arc-runners-h100 + - keda - buildkit - pypi-cache - cache-enforcer diff --git a/osdc/integration-tests/scripts/python/phases.py b/osdc/integration-tests/scripts/python/phases.py index 27ddebf6..7a21f204 100644 --- a/osdc/integration-tests/scripts/python/phases.py +++ b/osdc/integration-tests/scripts/python/phases.py @@ -276,11 +276,12 @@ def prepare_pr( # Write integration test workflow (workflows_dir / "integration-test.yaml").write_text(workflow_content) - # Copy build-image reusable workflow + # Copy the reusable BuildKit workflow (connectivity + autoscaling scale jobs). + # The scale job builds an inline Dockerfile, so it needs no copied context. build_wf_src = upstream_dir / "integration-tests" / "workflows" / "build-image.yaml" (workflows_dir / "build-image.yaml").write_text(build_wf_src.read_text()) - # Copy test Dockerfile + # Copy test Dockerfile (connectivity test context) docker_dir = canary_path / "docker" / "test-buildkit" docker_dir.mkdir(parents=True, exist_ok=True) dockerfile_src = upstream_dir / "integration-tests" / "docker" / "test-buildkit" / "Dockerfile" diff --git a/osdc/integration-tests/scripts/python/test_run.py b/osdc/integration-tests/scripts/python/test_run.py index ff3a7bcb..58e6ab48 100644 --- a/osdc/integration-tests/scripts/python/test_run.py +++ b/osdc/integration-tests/scripts/python/test_run.py @@ -112,7 +112,7 @@ def workflow_template(tmp_path): ) (wf_dir / "integration-test.yaml.tpl").write_text(template) - # Also create build-image.yaml and Dockerfile for prepare_pr + # Also create reusable workflow and Dockerfile for prepare_pr (wf_dir / "build-image.yaml").write_text("name: build-image\n") docker_dir = upstream / "integration-tests" / "docker" / "test-buildkit" docker_dir.mkdir(parents=True) diff --git a/osdc/integration-tests/workflows/build-image.yaml b/osdc/integration-tests/workflows/build-image.yaml index b523d03d..df36cfc8 100644 --- a/osdc/integration-tests/workflows/build-image.yaml +++ b/osdc/integration-tests/workflows/build-image.yaml @@ -1,6 +1,7 @@ -# Reusable workflow: Build a test image via OSDC BuildKit -# Called by integration-test.yaml to validate BuildKit connectivity. -# Uses buildctl directly — no Docker daemon required. +# Reusable workflow: exercise OSDC BuildKit for one arch. +# Called by integration-test.yaml. Two jobs: +# build — single buildctl build (validates connectivity; buildctl route) +# scale — burst of docker buildx builds (validates autoscaling; prod client) name: Build Test Image on: @@ -38,21 +39,95 @@ jobs: - name: Build test image via BuildKit run: | + set -eu echo "=== BuildKit ${{ inputs.arch }} connectivity test ===" ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" echo "Connecting to: $ENDPOINT" - buildctl --addr "$ENDPOINT" build \ - --frontend dockerfile.v0 \ - --local context=docker/test-buildkit \ - --local dockerfile=docker/test-buildkit \ - --output type=image,name=ghcr.io/${{ github.repository }}:integration-test-${{ inputs.arch }}-${{ github.sha }},push=false - - echo "PASS: BuildKit ${{ inputs.arch }} built successfully" - echo "Endpoint: $ENDPOINT" + # The buildkit client dials with gRPC's ~20s connect timeout, so a busy + # / cold pool drops the connection fast (no HAProxy queue holds it). + # Retry long enough to outlast a peer's ~10 min build when the pool is + # over-subscribed (9 builds > 8 pods): ~45 x (≈5s fail + 15s) ≈ 15 min. + for attempt in $(seq 1 45); do + if buildctl --addr "$ENDPOINT" build \ + --frontend dockerfile.v0 \ + --local context=docker/test-buildkit \ + --local dockerfile=docker/test-buildkit \ + --output type=image,name=ghcr.io/${{ github.repository }}:integration-test-${{ inputs.arch }}-${{ github.sha }},push=false; then + echo "PASS: BuildKit ${{ inputs.arch }} built on attempt ${attempt} ($ENDPOINT)" + exit 0 + fi + echo "attempt ${attempt}/45 failed (BuildKit cold/queued); retrying in 15s..." + sleep 15 + done + echo "FAIL: BuildKit ${{ inputs.arch }} build failed after retries" >&2 + exit 1 - name: Verify BuildKit endpoint info run: | ENDPOINT="tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" buildctl --addr "$ENDPOINT" debug info || echo "WARN: debug info not available" echo "PASS: BuildKit ${{ inputs.arch }} endpoint is responsive" + + scale: + # 8 parallel docker buildx builds (the prod client), each holding a BuildKit + # slot (server maxconn=1) ~10 min via a sleep. The warm baseline is below the + # burst, so they finish within timeout-minutes only if KEDA scales the pool + # up; otherwise the back of the burst serializes and the job times out — i.e. + # this FAILS if autoscaling does not happen. + # + # Runs concurrently with `build`, so 9 builds contend for a max-8 pool: the + # odd one out has no pod until a peer's ~10 min build finishes, exercising + # the over-subscription wait (the retry below must outlast that). + runs-on: ${{ inputs.runner_label }} + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + replica: [1, 2, 3, 4, 5, 6, 7, 8] + container: + image: ghcr.io/actions/actions-runner:latest + steps: + - name: Set up Docker Buildx (remote, no bootstrap) + shell: bash + # NOT docker/setup-buildx-action: it runs `buildx inspect --bootstrap`, + # whose ~20s connect timeout fails at setup during a cold scale-up. + # `create` (no --bootstrap) just registers the builder; the build step + # retries to wait out scale-up. + run: | + set -ex + docker buildx create \ + --name osdc-remote \ + --driver remote \ + --use \ + "tcp://buildkitd-${{ inputs.arch }}.buildkit:1234" + + - name: Occupy a BuildKit slot (~10 min) to drive autoscaling + shell: bash + run: | + set -eu + cat > Dockerfile.scale <<'EOF' + FROM alpine:3.21 + ARG CACHEBUST + RUN echo "osdc buildkit scale-test ${CACHEBUST}" && sleep 600 + EOF + # buildx boots the builder with a hardcoded ~20s connect timeout (gRPC + # MinConnectTimeout), so retry to wait out cold scale-up and, when the + # pool is over-subscribed, a peer's ~10 min build; the repeated attempts + # also keep KEDA's load signal alive. ~45 x (≈5s fail + 15s) ≈ 15 min, + # within the 30-min job timeout (still fails if scale-up never happens). + for attempt in $(seq 1 45); do + if docker buildx build \ + --platform "linux/${{ inputs.arch }}" \ + --build-arg "CACHEBUST=${{ inputs.arch }}-${{ matrix.replica }}-${{ github.run_id }}" \ + --no-cache \ + --output type=cacheonly \ + -f Dockerfile.scale .; then + echo "build succeeded on attempt ${attempt}" + exit 0 + fi + echo "attempt ${attempt}/45 failed (BuildKit cold/queued); retrying in 15s..." + sleep 15 + done + echo "build failed after retries" >&2 + exit 1 diff --git a/osdc/integration-tests/workflows/integration-test.yaml.tpl b/osdc/integration-tests/workflows/integration-test.yaml.tpl index 4c20c327..63b60740 100644 --- a/osdc/integration-tests/workflows/integration-test.yaml.tpl +++ b/osdc/integration-tests/workflows/integration-test.yaml.tpl @@ -1436,13 +1436,15 @@ jobs: # END_B200 # ── BuildKit Tests ──────────────────────────────────────────────────── - build-amd64: + # Each call runs a buildctl connectivity build + an 8-wide docker buildx burst + # (fails if KEDA does not scale the pool up). + buildkit-amd64: uses: ./.github/workflows/build-image.yaml with: arch: amd64 runner_label: {{PREFIX}}l-x86iamx-8-32 - build-arm64: + buildkit-arm64: uses: ./.github/workflows/build-image.yaml with: arch: arm64 diff --git a/osdc/modules/buildkit/deploy.sh b/osdc/modules/buildkit/deploy.sh index eb08b727..3f26c9b8 100755 --- a/osdc/modules/buildkit/deploy.sh +++ b/osdc/modules/buildkit/deploy.sh @@ -34,7 +34,8 @@ AMD64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_replicas "$REPLICAS") ARM64_REPLICAS=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_replicas "$REPLICAS") AMD64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.amd64_pods_per_node "$PODS_PER_NODE") ARM64_PODS_PER_NODE=$(uv run "$CFG" "$CLUSTER" buildkit.arm64_pods_per_node "$PODS_PER_NODE") -AUTOSCALING=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.enabled false) +# Lowercase via tr (not ${VAR,,}) — deploy.sh runs under macOS bash 3.2 too. +AUTOSCALING=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.enabled false | tr '[:upper:]' '[:lower:]') GENERATED_DIR="$MODULE_DIR/generated" @@ -52,7 +53,7 @@ GEN_ARGS=( --arm64-pods-per-node "$ARM64_PODS_PER_NODE" --output-dir "$GENERATED_DIR" ) -if [[ "${AUTOSCALING,,}" == "true" ]]; then +if [[ "$AUTOSCALING" == "true" ]]; then AMD64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_min 2) AMD64_MAX=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.amd64_max 8) ARM64_MIN=$(uv run "$CFG" "$CLUSTER" buildkit.autoscaling.arm64_min 4) @@ -125,7 +126,7 @@ fi # --- KEDA autoscaling (optional) --- # Scales on the in-cluster buildkit LB metrics; no external metrics backend. -if [[ "${AUTOSCALING,,}" == "true" ]]; then +if [[ "$AUTOSCALING" == "true" ]]; then echo "Applying KEDA autoscaling manifests..." kubectl_apply_if_changed -f "$GENERATED_DIR/autoscaling.yaml" fi diff --git a/osdc/modules/keda/deploy.sh b/osdc/modules/keda/deploy.sh index 59f422ad..e74d3d9f 100755 --- a/osdc/modules/keda/deploy.sh +++ b/osdc/modules/keda/deploy.sh @@ -27,6 +27,7 @@ helm repo update kedacore >/dev/null 2>&1 || true helm_upgrade_if_changed keda "$NAMESPACE" \ --create-namespace \ --version "$CHART_VERSION" \ + -f "$MODULE_DIR/helm/values.yaml" \ --timeout 5m \ --wait \ kedacore/keda diff --git a/osdc/modules/keda/helm/values.yaml b/osdc/modules/keda/helm/values.yaml new file mode 100644 index 00000000..282ca179 --- /dev/null +++ b/osdc/modules/keda/helm/values.yaml @@ -0,0 +1,13 @@ +# Schedule on the base-infra nodes (tainted CriticalAddonsOnly); every other node +# is reserved by workload taints, so without this the keda pods stay Pending and +# the install's --wait times out. Applies to operator, metrics server, webhooks. +tolerations: + - key: CriticalAddonsOnly + operator: Exists + +# Expose the KEDA operator's Prometheus metrics (keda_scaler_* / +# keda_scaledobject_*) on :8080. The ServiceMonitor that scrapes it lives in the +# monitoring module, so it applies after the monitoring.coreos.com CRDs exist. +prometheus: + operator: + enabled: true diff --git a/osdc/modules/monitoring/kubernetes/monitors/kustomization.yaml b/osdc/modules/monitoring/kubernetes/monitors/kustomization.yaml index 2d223760..08e1aa28 100644 --- a/osdc/modules/monitoring/kubernetes/monitors/kustomization.yaml +++ b/osdc/modules/monitoring/kubernetes/monitors/kustomization.yaml @@ -12,6 +12,7 @@ resources: - servicemonitors/buildkit-haproxy.yaml - servicemonitors/harbor.yaml - servicemonitors/karpenter.yaml + - servicemonitors/keda.yaml - servicemonitors/node-compactor.yaml - servicemonitors/pushgateway.yaml - servicemonitors/pypi-cache.yaml diff --git a/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/keda.yaml b/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/keda.yaml new file mode 100644 index 00000000..7200fc48 --- /dev/null +++ b/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/keda.yaml @@ -0,0 +1,28 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: keda + namespace: monitoring + labels: + app.kubernetes.io/part-of: osdc-monitoring +spec: + namespaceSelector: + matchNames: + - keda + selector: + matchLabels: + app.kubernetes.io/name: keda-operator + endpoints: + - port: metrics + path: /metrics + interval: 60s + metricRelabelings: + # Keep only KEDA's own metrics (scaler/scaledobject values, errors, + # activity, latency) — this also drops the endpoint's go_/process_/ + # controller-runtime series. Then drop histogram buckets. + - action: keep + sourceLabels: [__name__] + regex: "keda_.*" + - action: drop + sourceLabels: [__name__] + regex: ".*_bucket" diff --git a/osdc/modules/monitoring/tests/smoke/test_monitoring.py b/osdc/modules/monitoring/tests/smoke/test_monitoring.py index c095df99..17123ffd 100644 --- a/osdc/modules/monitoring/tests/smoke/test_monitoring.py +++ b/osdc/modules/monitoring/tests/smoke/test_monitoring.py @@ -29,6 +29,7 @@ "buildkit-haproxy", "harbor", "karpenter", + "keda", "node-compactor", "pypi-cache", "dcgm-exporter", @@ -289,6 +290,7 @@ def test_metrics_arriving(self, resolve_config) -> None: "buildkit": ("buildkitd-pods", "buildkit"), "buildkit-haproxy": ("buildkitd-lb-metrics", "buildkit"), "karpenter": ("karpenter", "karpenter"), + "keda": ("keda-operator", "keda"), "node-compactor": ("node-compactor", None), # arc-controller: skipped — ARC controller metrics Service varies by chart version # harbor: skipped — Harbor exporter Service name varies by chart version