From 40abdad034b18790788009994eeda6b0f27da98b Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 08:25:10 -0700 Subject: [PATCH 01/21] ci: harden H100 GPU qualification --- .../check-control-plane-health/action.yml | 116 ++++++++++++++++++ .../actions/gpu-operator-install/action.yml | 39 +++--- .../actions/gpu-snapshot-validate/action.yml | 5 + .github/actions/gpu-test-cleanup/action.yml | 9 ++ .../actions/install-karpenter-kwok/action.yml | 15 +++ .../workflows/gpu-h100-inference-test.yaml | 46 ++++++- .github/workflows/gpu-h100-training-test.yaml | 46 ++++++- kwok/scripts/install-karpenter-kwok.sh | 14 ++- pkg/bundler/deployer/helm/helm_test.go | 109 +++++++++++++++- .../deployer/helm/templates/deploy.sh.tmpl | 18 ++- .../kind-training-kubeflow/chainsaw-test.yaml | 2 + 11 files changed, 390 insertions(+), 29 deletions(-) create mode 100644 .github/actions/check-control-plane-health/action.yml diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml new file mode 100644 index 000000000..ae2a87db7 --- /dev/null +++ b/.github/actions/check-control-plane-health/action.yml @@ -0,0 +1,116 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: 'Check Control Plane Health' +description: 'Fails if Kind control-plane static pods are missing, unready, or restarted.' 
+ +inputs: + cluster_name: + description: 'Kind cluster name' + required: true + namespace: + description: 'Namespace that contains the control-plane pods' + required: false + default: kube-system + components: + description: 'Space-separated component label values to check' + required: false + default: kube-apiserver kube-controller-manager kube-scheduler etcd + wait_timeout: + description: 'Timeout for each component readiness wait' + required: false + default: 60s + max_restarts: + description: 'Maximum tolerated restart count for each control-plane container' + required: false + default: '1' + +runs: + using: 'composite' + steps: + - name: Check control-plane pods + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + NAMESPACE: ${{ inputs.namespace }} + COMPONENTS: ${{ inputs.components }} + WAIT_TIMEOUT: ${{ inputs.wait_timeout }} + MAX_RESTARTS: ${{ inputs.max_restarts }} + run: | + set -euo pipefail + + MAX_RESTARTS="${MAX_RESTARTS#"${MAX_RESTARTS%%[![:space:]]*}"}" + MAX_RESTARTS="${MAX_RESTARTS%"${MAX_RESTARTS##*[![:space:]]}"}" + if ! [[ "${MAX_RESTARTS}" =~ ^[0-9]+$ ]]; then + echo "::error::max_restarts must be a non-negative integer, got '${MAX_RESTARTS}'" + exit 1 + fi + + kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz' || true + + check_component() { + local component="$1" + local selector="component=${component}" + local pods + + pods=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + get pod -l "${selector}" -o name) + if [[ -z "${pods}" ]]; then + echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}" + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + get pods -o wide || true + exit 1 + fi + + if ! 
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then + echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}" + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + get pod -l "${selector}" -o wide || true + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + describe pod -l "${selector}" || true + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz' || true + exit 1 + fi + + local restart_counts + restart_counts=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + get pod -l "${selector}" \ + -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}') + if [[ -z "${restart_counts}" ]]; then + echo "::error::no container statuses found for ${component} pods" + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + describe pod -l "${selector}" || true + exit 1 + fi + + while IFS= read -r restart_count; do + [[ -z "${restart_count}" ]] && continue + if (( restart_count > MAX_RESTARTS )); then + echo "::error::${component} restartCount=${restart_count}" + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + get pod -l "${selector}" -o wide || true + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + describe pod -l "${selector}" || true + exit 1 + fi + done <<< "${restart_counts}" + } + + for component in ${COMPONENTS}; do + check_component "${component}" + done + kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz' diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml index e2bdb300c..86d247932 100644 --- a/.github/actions/gpu-operator-install/action.yml +++ 
b/.github/actions/gpu-operator-install/action.yml @@ -31,6 +31,14 @@ inputs: description: 'Platform for recipe generation (bundle mode only, e.g. dynamo)' required: false default: '' + wait: + description: 'Wait for bundle Helm resources during deploy' + required: false + default: 'false' + best_effort: + description: 'Continue deploying remaining bundle components after a component failure' + required: false + default: 'true' runs: using: 'composite' @@ -82,8 +90,7 @@ runs: --intent ${{ inputs.intent }} \ ${PLATFORM_FLAG} \ --output recipe.yaml - echo "--- Recipe ---" - cat recipe.yaml + echo "Recipe written to recipe.yaml" - name: Generate deployment bundle if: inputs.method == 'bundle' @@ -101,19 +108,23 @@ runs: shell: bash run: | cd bundle - # Use --no-wait: several components (gpu-operator ClusterPolicy, - # kai-scheduler SchedulingShard, nvidia-dra-driver-gpu kubelet plugin) - # stay InProgress in kind because their CRs/DaemonSets require - # features not available in kind (DRA feature gates, driver modules). - # The explicit "Wait for GPU operands" step below gates on what - # actually matters (device plugin readiness). - # --best-effort: some components (e.g. network-operator) have Helm - # hooks that may time out in Kind; continue deploying remaining - # components so the overall stack is functional. + # The default keeps legacy bundle-mode behavior: do not wait on every + # Helm resource and keep deploying after component failures. H100 + # qualification jobs override these inputs to hard-fail and wait. 
chmod +x deploy.sh - echo "--- deploy.sh ---" - cat deploy.sh - ./deploy.sh --no-wait --best-effort + DEPLOY_ARGS=() + if [[ "${{ inputs.wait }}" != "true" ]]; then + DEPLOY_ARGS+=(--no-wait) + fi + if [[ "${{ inputs.best_effort }}" == "true" ]]; then + DEPLOY_ARGS+=(--best-effort) + fi + if [[ "${#DEPLOY_ARGS[@]}" -gt 0 ]]; then + echo "Deploying bundle with args: ${DEPLOY_ARGS[*]}" + else + echo "Deploying bundle with default args" + fi + ./deploy.sh "${DEPLOY_ARGS[@]}" - name: Wait for GPU operands (bundle) if: inputs.method == 'bundle' diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml index e1ee3c14b..9cfb67e9c 100644 --- a/.github/actions/gpu-snapshot-validate/action.yml +++ b/.github/actions/gpu-snapshot-validate/action.yml @@ -26,6 +26,10 @@ inputs: cluster_name: description: 'Kind cluster name (for kubectl context)' required: true + snapshot_timeout: + description: 'Timeout for aicr snapshot' + required: false + default: '5m' runs: using: composite @@ -38,6 +42,7 @@ runs: --namespace=default \ --image=ko.local:smoke-test \ --require-gpu \ + --timeout="${{ inputs.snapshot_timeout }}" \ --output=snapshot.yaml echo "--- Snapshot output ---" cat snapshot.yaml diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml index 30ac7831f..d2089816b 100644 --- a/.github/actions/gpu-test-cleanup/action.yml +++ b/.github/actions/gpu-test-cleanup/action.yml @@ -40,6 +40,15 @@ runs: kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > 
/tmp/debug-artifacts/non-running-pods.txt || true + tar_inputs=() + [[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml) + [[ -d bundle ]] && tar_inputs+=(bundle) + if [[ "${#tar_inputs[@]}" -gt 0 ]]; then + echo "Archiving runtime bundle inputs: ${tar_inputs[*]}" + tar -czf /tmp/debug-artifacts/aicr-runtime-bundle.tar.gz "${tar_inputs[@]}" || true + else + echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive" + fi - name: Export kind logs if: failure() diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml index fde7bddde..dab642174 100644 --- a/.github/actions/install-karpenter-kwok/action.yml +++ b/.github/actions/install-karpenter-kwok/action.yml @@ -19,6 +19,18 @@ inputs: cluster_name: description: 'Kind cluster name (used for kubectl context)' required: true + kwok_helm_timeout: + description: 'Timeout for KWOK controller Helm install' + required: false + default: '300s' + ko_build_timeout: + description: 'Timeout in seconds for Karpenter KWOK provider ko build' + required: false + default: '900' + karpenter_helm_timeout: + description: 'Timeout for Karpenter Helm install' + required: false + default: '300s' runs: using: 'composite' @@ -46,6 +58,9 @@ runs: env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} KARPENTER_VERSION: ${{ steps.versions.outputs.karpenter }} + KWOK_HELM_TIMEOUT: ${{ inputs.kwok_helm_timeout }} + KO_BUILD_TIMEOUT: ${{ inputs.ko_build_timeout }} + KARPENTER_HELM_TIMEOUT: ${{ inputs.karpenter_helm_timeout }} run: | set -euo pipefail bash kwok/scripts/install-karpenter-kwok.sh diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index c5e1882d4..373cd3c6e 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -50,6 +50,7 @@ jobs: - '.settings.yaml' - '.github/actions/gpu-cluster-setup/**' - '.github/actions/gpu-operator-install/**' + - 
'.github/actions/check-control-plane-health/**' - '.github/actions/aicr-build/**' - '.github/actions/setup-build-tools/**' - '.github/actions/install-karpenter-kwok/**' @@ -96,7 +97,9 @@ jobs: group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} cancel-in-progress: true runs-on: linux-amd64-gpu-h100-latest-2 - timeout-minutes: 120 + # Cold self-hosted H100 runners can spend most of the old budget pulling + # images and loading Kind nodes before validation starts. + timeout-minutes: 180 env: KIND_CLUSTER_NAME: gpu-inference-test @@ -116,6 +119,12 @@ jobs: with: validator_phases: 'conformance' + - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + - name: Install runtime bundle id: bundle-install uses: ./.github/actions/gpu-operator-install @@ -123,6 +132,14 @@ jobs: method: bundle accelerator: h100 platform: dynamo + wait: 'true' + best_effort: 'false' + + - name: Check control plane health after runtime install + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s # --- Snapshot and GPU validation --- @@ -132,16 +149,32 @@ jobs: gpu_model: H100 min_gpu_count: '2' cluster_name: ${{ env.KIND_CLUSTER_NAME }} + snapshot_timeout: 10m # --- Install Karpenter + KWOK early to give monitoring stack settle time --- + - name: Check control plane health before Karpenter + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + - name: Install Karpenter + KWOK uses: ./.github/actions/install-karpenter-kwok with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} + kwok_helm_timeout: 600s + ko_build_timeout: '1200' + karpenter_helm_timeout: 600s # --- Health checks --- + - name: Check control plane health after Karpenter + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ 
env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + - name: Prepare chainsaw id: versions uses: ./.github/actions/load-versions @@ -171,6 +204,12 @@ jobs: --kubeconfig="${HOME}/.kube/config" \ --debug + - name: Check control plane health before conformance validation + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + - name: Validate CNCF AI Conformance id: validate-conformance run: | @@ -193,8 +232,9 @@ jobs: # training smoke test if needed. # --- Validation artifacts --- - # Collect a post-run resource snapshot regardless of whether conformance - # validation ran, so triage always has a cluster-state artifact. + # Collect a post-run resource snapshot whenever the runtime bundle + # installed. This preserves triage data for snapshot/chainsaw/conformance + # failures; continue-on-error keeps the original failure intact. - name: Collect validation artifacts if: >- always() diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index d3a04de03..36a2d1eec 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -50,6 +50,7 @@ jobs: - '.settings.yaml' - '.github/actions/gpu-cluster-setup/**' - '.github/actions/gpu-operator-install/**' + - '.github/actions/check-control-plane-health/**' - '.github/actions/aicr-build/**' - '.github/actions/setup-build-tools/**' - '.github/actions/install-karpenter-kwok/**' @@ -92,7 +93,9 @@ jobs: group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} cancel-in-progress: true runs-on: linux-amd64-gpu-h100-latest-2 - timeout-minutes: 120 + # Cold self-hosted H100 runners can spend most of the old budget pulling + # images and loading Kind nodes before validation starts. 
+ timeout-minutes: 180 env: KIND_CLUSTER_NAME: gpu-training-test @@ -112,6 +115,12 @@ jobs: with: validator_phases: 'conformance' + - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + - name: Install runtime bundle id: bundle-install uses: ./.github/actions/gpu-operator-install @@ -120,6 +129,14 @@ jobs: accelerator: h100 intent: training platform: kubeflow + wait: 'true' + best_effort: 'false' + + - name: Check control plane health after runtime install + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s # --- Snapshot and GPU validation --- @@ -129,16 +146,32 @@ jobs: gpu_model: H100 min_gpu_count: '2' cluster_name: ${{ env.KIND_CLUSTER_NAME }} + snapshot_timeout: 10m # --- Install Karpenter + KWOK early to give monitoring stack settle time --- + - name: Check control plane health before Karpenter + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + - name: Install Karpenter + KWOK uses: ./.github/actions/install-karpenter-kwok with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} + kwok_helm_timeout: 600s + ko_build_timeout: '1200' + karpenter_helm_timeout: 600s # --- Health checks --- + - name: Check control plane health after Karpenter + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + - name: Prepare chainsaw id: versions uses: ./.github/actions/load-versions @@ -168,6 +201,12 @@ jobs: --kubeconfig="${HOME}/.kube/config" \ --debug + - name: Check control plane health before conformance validation + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + - name: Validate CNCF AI Conformance id: validate-conformance run: | @@ -186,8 +225,9 @@ jobs: # --- 
Validation artifacts --- - # Collect a post-run resource snapshot regardless of whether conformance - # validation ran, so triage always has a cluster-state artifact. + # Collect a post-run resource snapshot whenever the runtime bundle + # installed. This preserves triage data for snapshot/chainsaw/conformance + # failures; continue-on-error keeps the original failure intact. - name: Collect validation artifacts if: >- always() diff --git a/kwok/scripts/install-karpenter-kwok.sh b/kwok/scripts/install-karpenter-kwok.sh index 72b64dae1..d6a17481f 100755 --- a/kwok/scripts/install-karpenter-kwok.sh +++ b/kwok/scripts/install-karpenter-kwok.sh @@ -41,7 +41,9 @@ KARPENTER_VERSION="${KARPENTER_VERSION:-v1.8.0}" KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}" KARPENTER_NAMESPACE="${KARPENTER_NAMESPACE:-karpenter}" KARPENTER_CLONE_DIR="${KARPENTER_CLONE_DIR:-/tmp/karpenter}" +KWOK_HELM_TIMEOUT="${KWOK_HELM_TIMEOUT:-300s}" KO_BUILD_TIMEOUT="${KO_BUILD_TIMEOUT:-900}" # 15 minutes +KARPENTER_HELM_TIMEOUT="${KARPENTER_HELM_TIMEOUT:-300s}" RED='\033[0;31m' GREEN='\033[0;32m' @@ -68,7 +70,7 @@ install_kwok() { helm upgrade --install kwok-controller kwok/kwok \ --namespace kube-system \ --set hostNetwork=true \ - --wait --timeout 300s + --wait --timeout "${KWOK_HELM_TIMEOUT}" helm upgrade --install kwok-stage-fast kwok/stage-fast \ --namespace kube-system @@ -98,11 +100,16 @@ build_karpenter() { # Redirect stderr to avoid Go compilation warnings corrupting the image reference. # Output format: kind.local/: # Hard timeout prevents a slow/stuck compilation from consuming the entire job. 
+ local ko_stderr="${KARPENTER_CLONE_DIR}/ko-build.stderr" CONTROLLER_IMG=$(timeout "${KO_BUILD_TIMEOUT}" \ env KO_DOCKER_REPO=kind.local \ KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}" \ - ko build sigs.k8s.io/karpenter/kwok 2>/dev/null) || { + ko build sigs.k8s.io/karpenter/kwok 2>"${ko_stderr}") || { log_error "ko build failed or timed out after ${KO_BUILD_TIMEOUT}s" + if [[ -s "${ko_stderr}" ]]; then + log_error "ko build stderr:" + sed 's/^/ /' "${ko_stderr}" || true + fi exit 1 } @@ -187,7 +194,7 @@ deploy_karpenter() { --set 'controller.extraVolumeMounts[0].readOnly=true' \ --set 'controller.env[0].name=INSTANCE_TYPES_FILE_PATH' \ --set 'controller.env[0].value=/etc/karpenter/instance-types/instance-types.json' \ - --wait --timeout 300s \ + --wait --timeout "${KARPENTER_HELM_TIMEOUT}" \ || { log_error "Helm install failed. Diagnostics:" kubectl -n "${KARPENTER_NAMESPACE}" get pods -o wide 2>/dev/null || true @@ -213,6 +220,7 @@ main() { log_info "Karpenter version: ${KARPENTER_VERSION}" log_info "Kind cluster: ${KIND_CLUSTER_NAME}" log_info "Namespace: ${KARPENTER_NAMESPACE}" + log_info "Timeouts: kwok=${KWOK_HELM_TIMEOUT} ko-build=${KO_BUILD_TIMEOUT}s karpenter=${KARPENTER_HELM_TIMEOUT}" install_kwok build_karpenter diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 435948e8f..79c22f8fd 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -20,6 +20,7 @@ import ( "os" "os/exec" "path/filepath" + "regexp" "strings" "testing" "time" @@ -493,9 +494,9 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) { } script := string(content) - // kai-scheduler should get a custom 20m timeout override - if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="20m"`) { - t.Error("deploy.sh missing kai-scheduler 20m timeout override") + // kai-scheduler should get a custom 30m timeout override + if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="30m"`) { + 
t.Error("deploy.sh missing kai-scheduler 30m timeout override") } // Other components should use the default HELM_TIMEOUT if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`) { @@ -505,6 +506,9 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) { if !strings.Contains(script, `COMPONENT_MAX_RETRIES="1"`) { t.Error("deploy.sh missing kai-scheduler retry override") } + if !strings.Contains(script, `if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]`) { + t.Error("deploy.sh missing kai-scheduler retry cap") + } if !strings.Contains(script, `dump_kai_scheduler_helm_diagnostics "${namespace}"`) { t.Error("deploy.sh missing kai-scheduler diagnostics hook") } @@ -516,6 +520,105 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) { } } +func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { + retryCapPattern := regexp.MustCompile(`(?m)(if \[\[ "\$\{COMPONENT_MAX_RETRIES\}" -gt \d+ \]\]|COMPONENT_MAX_RETRIES="\d+")`) + tests := []struct { + name string + component recipe.ComponentRef + wantTimeout string + wantRetryAssignment string + wantRetryCap string + wantComment string + rejectRetryCap bool + }{ + { + name: "dynamo-platform", + component: recipe.ComponentRef{ + Name: "dynamo-platform", + Namespace: "dynamo-system", + Chart: "dynamo-platform", + Version: "0.9.0", + Type: recipe.ComponentTypeHelm, + Source: "oci://nvcr.io/nvidia/ai-dynamo", + }, + wantTimeout: `COMPONENT_HELM_TIMEOUT="30m"`, + wantRetryAssignment: `COMPONENT_MAX_RETRIES="1"`, + wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]`, + }, + { + name: "kube-prometheus-stack", + component: recipe.ComponentRef{ + Name: "kube-prometheus-stack", + Namespace: "monitoring", + Chart: "kube-prometheus-stack", + Version: "82.8.0", + Type: recipe.ComponentTypeHelm, + Source: "https://prometheus-community.github.io/helm-charts", + }, + wantTimeout: `COMPONENT_HELM_TIMEOUT="20m"`, + wantComment: `Keep the default retry budget for kube-prometheus-stack`, + 
rejectRetryCap: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + outputDir := t.TempDir() + + g := &Generator{ + RecipeResult: &recipe.RecipeResult{ + Kind: "RecipeResult", + APIVersion: "aicr.nvidia.com/v1alpha1", + ComponentRefs: []recipe.ComponentRef{tt.component}, + DeploymentOrder: []string{tt.component.Name}, + }, + ComponentValues: map[string]map[string]any{ + tt.component.Name: {}, + }, + Version: "v1.0.0", + } + + _, err := g.Generate(ctx, outputDir) + if err != nil { + t.Fatalf("Generate failed: %v", err) + } + + content, err := os.ReadFile(filepath.Join(outputDir, "deploy.sh")) + if err != nil { + t.Fatalf("failed to read deploy.sh: %v", err) + } + script := string(content) + + blockStart := strings.Index(script, `Installing `+tt.component.Name) + if blockStart == -1 { + t.Fatalf("deploy.sh missing %s install block", tt.component.Name) + } + blockEnd := strings.Index(script[blockStart:], `helm upgrade --install `+tt.component.Name) + if blockEnd == -1 { + t.Fatalf("deploy.sh missing %s helm install command", tt.component.Name) + } + componentBlock := script[blockStart : blockStart+blockEnd] + + if !strings.Contains(componentBlock, tt.wantTimeout) { + t.Errorf("deploy.sh missing %s timeout override %q", tt.component.Name, tt.wantTimeout) + } + if tt.wantRetryAssignment != "" && !strings.Contains(componentBlock, tt.wantRetryAssignment) { + t.Errorf("deploy.sh missing %s retry override %q", tt.component.Name, tt.wantRetryAssignment) + } + if tt.wantRetryCap != "" && !strings.Contains(componentBlock, tt.wantRetryCap) { + t.Errorf("deploy.sh missing %s retry cap %q", tt.component.Name, tt.wantRetryCap) + } + if tt.wantComment != "" && !strings.Contains(componentBlock, tt.wantComment) { + t.Errorf("deploy.sh missing %s retry rationale", tt.component.Name) + } + if tt.rejectRetryCap && retryCapPattern.MatchString(componentBlock) { + t.Errorf("deploy.sh should not cap %s retries in its component 
block", tt.component.Name) + } + }) + } +} + func TestGenerate_UndeployScriptExecutable(t *testing.T) { ctx := context.Background() outputDir := t.TempDir() diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 0f83eb71c..e456bba58 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -371,13 +371,25 @@ retry "{{ .Name }} pre-install manifests" apply_ignoring_crd_race "${SCRIPT_DIR} || helm_failed "{{ .Name }}" {{ end -}} # Per-component timeout override. Most components use HELM_TIMEOUT (10m). -# Components with slow hooks (e.g., kai-scheduler crd-upgrader image pull -# on cold runners) get a longer timeout to avoid unnecessary retry cycles. +# Components with slow hooks on cold runners get a longer timeout to avoid +# unnecessary retry cycles. COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}" COMPONENT_MAX_RETRIES="${MAX_RETRIES}" {{ if eq .Name "kai-scheduler" -}} +COMPONENT_HELM_TIMEOUT="30m" +if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then + COMPONENT_MAX_RETRIES="1" +fi +{{ else if eq .Name "dynamo-platform" -}} +COMPONENT_HELM_TIMEOUT="30m" +if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then + COMPONENT_MAX_RETRIES="1" +fi +{{ else if eq .Name "kube-prometheus-stack" -}} COMPONENT_HELM_TIMEOUT="20m" -COMPONENT_MAX_RETRIES="1" +# Keep the default retry budget for kube-prometheus-stack. On cold H100 +# runners, Grafana can hit ProgressDeadlineExceeded multiple times before +# images and rollout state are warm enough for a later retry to succeed. {{ end -}} # Derive wait args: global --wait/--no-wait behavior + component timeout. 
if [[ "${NO_WAIT}" == "true" ]]; then diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml index 382d99104..9809cd0bb 100644 --- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml @@ -73,6 +73,8 @@ spec: - name: assert-kubeflow-trainer description: Verify Kubeflow Trainer controller, validating webhook, and TrainJob CRD are available. + timeouts: + assert: 600s try: - assert: file: assert-kubeflow-trainer.yaml From ae0b717e04746f7ed1481621f6747ed84b84d787 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 18:06:56 -0700 Subject: [PATCH 02/21] ci: split aicr build artifacts --- .github/actions/aicr-build/action.yml | 32 +++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml index 7a973ae21..ce9841f2f 100644 --- a/.github/actions/aicr-build/action.yml +++ b/.github/actions/aicr-build/action.yml @@ -13,9 +13,17 @@ # limitations under the License. name: 'AICR Build' -description: 'Builds the aicr validator image (via Dockerfile) and CLI binary, and loads the image into kind.' +description: 'Builds the aicr CLI and optional snapshot/validator images, and loads requested images into kind.' inputs: + build_cli: + description: 'Build and stage the aicr CLI binary at the repository root' + required: false + default: 'true' + build_snapshot_agent: + description: 'Build the CUDA-based snapshot agent image and load it into kind' + required: false + default: 'true' build_validators: description: 'Deprecated: use validator_phases instead. Ignored when validator_phases is set.' 
required: false @@ -35,15 +43,27 @@ runs: KO_VERSION=$(yq eval '.build_tools.ko' .settings.yaml) GOFLAGS= go install "github.com/google/ko@${KO_VERSION}" - - name: Build snapshot agent image and load into kind + - name: Build aicr CLI binary + if: inputs.build_cli == 'true' || inputs.build_snapshot_agent == 'true' shell: bash env: GOFLAGS: -mod=vendor run: | + set -euo pipefail + if [[ -x dist/aicr ]]; then + echo "Reusing existing dist/aicr" + exit 0 + fi + CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr + + - name: Build snapshot agent image and load into kind + if: inputs.build_snapshot_agent == 'true' + shell: bash + run: | + set -euo pipefail # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection). # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed. # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot. - CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE' FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04 COPY dist/aicr /usr/local/bin/aicr @@ -69,6 +89,7 @@ runs: env: GOFLAGS: -mod=vendor run: | + set -euo pipefail # Determine which validator phases to build. # validator_phases takes precedence; build_validators is a deprecated fallback. 
if [[ -n "${{ inputs.validator_phases }}" ]]; then @@ -109,5 +130,8 @@ runs: done - name: Stage aicr binary at repo root + if: inputs.build_cli == 'true' shell: bash - run: cp dist/aicr ./aicr + run: | + set -euo pipefail + cp dist/aicr ./aicr From 33417c3e4aac6bd4f5b70daab0e4b392221baad4 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 18:07:00 -0700 Subject: [PATCH 03/21] ci: harden gpu cluster setup --- .github/actions/README.md | 3 +- .github/actions/gpu-cluster-setup/action.yml | 164 ++++++++++++++++++- .github/actions/load-versions/action.yml | 10 ++ .settings.yaml | 2 + 4 files changed, 173 insertions(+), 6 deletions(-) diff --git a/.github/actions/README.md b/.github/actions/README.md index cef2fd6ca..3b1ee648d 100644 --- a/.github/actions/README.md +++ b/.github/actions/README.md @@ -50,7 +50,8 @@ This action runs `tools/setup-tools --skip-go --skip-docker` in auto mode, which **When to use**: When you need version values in workflow steps **Outputs**: - `go`, `goreleaser`, `ko`, `crane`, `golangci_lint`, `yamllint`, `addlicense` -- `grype`, `kubectl`, `kind`, `ctlptl`, `tilt`, `helm` +- `grype`, `kubectl`, `kind`, `nvkind`, `ctlptl`, `tilt`, `helm` +- `kind_node_image`, `h100_kind_node_image` **Example**: ```yaml diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml index b9bc3060f..c5b5bb3cd 100644 --- a/.github/actions/gpu-cluster-setup/action.yml +++ b/.github/actions/gpu-cluster-setup/action.yml @@ -15,6 +15,32 @@ name: 'GPU Cluster Setup' description: 'Creates a GPU-enabled kind cluster using nvkind with CDI-mode GPU passthrough.' 
+inputs: + kind_node_image: + description: 'Kind node image for nvkind cluster creation' + required: false + default: '' + min_gpu_count: + description: 'Minimum visible GPU count required before cluster setup' + required: false + default: '1' + gpu_model_pattern: + description: 'Optional grep-compatible GPU model pattern required for visible GPUs' + required: false + default: '' + min_free_disk_gb: + description: 'Minimum free disk space on / required before cluster setup' + required: false + default: '20' + min_available_memory_gb: + description: 'Minimum available system memory required before cluster setup' + required: false + default: '8' + cluster_create_timeout: + description: 'Timeout for nvkind cluster create' + required: false + default: '900s' + runs: using: 'composite' steps: @@ -53,12 +79,84 @@ runs: - name: Install nvkind shell: bash run: | - go install github.com/NVIDIA/nvkind/cmd/nvkind@latest + go install "github.com/NVIDIA/nvkind/cmd/nvkind@${{ steps.versions.outputs.nvkind }}" nvkind --help - - name: Verify host GPU + - name: Runner preflight shell: bash - run: nvidia-smi -L + env: + GPU_MODEL_PATTERN: ${{ inputs.gpu_model_pattern }} + MIN_GPU_COUNT: ${{ inputs.min_gpu_count }} + MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} + MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }} + KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} + run: | + set -euo pipefail + + echo "=== Runner baseline ===" + date -u + hostname + uptime + nproc + free -h + df -h / + df -ih / + + for value_name in MIN_GPU_COUNT MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do + value="${!value_name}" + if ! 
[[ "${value}" =~ ^[0-9]+$ ]]; then + echo "::error::${value_name} must be an integer, got '${value}'" + exit 1 + fi + done + + free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9') + if (( free_disk_gb < MIN_FREE_DISK_GB )); then + echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB" + exit 1 + fi + + available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}') + if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then + echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB" + exit 1 + fi + + echo "=== Docker health ===" + docker info >/dev/null + docker version + + echo "=== Host GPUs ===" + nvidia-smi -L + nvidia-smi + + mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader) + if [[ -n "${GPU_MODEL_PATTERN}" ]]; then + gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic "${GPU_MODEL_PATTERN}" || true) + echo "Visible GPUs matching '${GPU_MODEL_PATTERN}': ${gpu_count}" + else + gpu_count="${#gpu_names[@]}" + echo "Visible GPUs: ${gpu_count}" + fi + + if (( gpu_count < MIN_GPU_COUNT )); then + echo "::error::visible GPU count ${gpu_count} is below required minimum ${MIN_GPU_COUNT}" + exit 1 + fi + + echo "=== Existing kind state ===" + kind get clusters || true + docker ps -a --filter "name=${KIND_CLUSTER_NAME}" || true + + if [[ -n "${KIND_NODE_IMAGE}" ]]; then + echo "=== Kind node image cache ===" + if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then + echo "Kind node image already cached: ${KIND_NODE_IMAGE}" + else + echo "Pulling kind node image: ${KIND_NODE_IMAGE}" + timeout 600s docker pull "${KIND_NODE_IMAGE}" + fi + fi - name: Configure NVIDIA Container Toolkit for kind shell: bash @@ -70,7 +168,9 @@ runs: - name: Validate Docker GPU access shell: bash - run: docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L + run: | + set -euo pipefail + docker run --rm -v 
/dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L - name: Increase inotify limits shell: bash @@ -78,14 +178,68 @@ runs: sudo sysctl -w fs.inotify.max_user_watches=524288 sudo sysctl -w fs.inotify.max_user_instances=1024 + - name: Delete stale kind cluster + shell: bash + run: | + set -euo pipefail + if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then + echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}" + timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}" + else + echo "No stale kind cluster named ${KIND_CLUSTER_NAME}" + fi + + remaining_containers=$(docker ps -aq --filter "name=${KIND_CLUSTER_NAME}") + if [[ -n "${remaining_containers}" ]]; then + echo "Removing stale containers for ${KIND_CLUSTER_NAME}:" + docker ps -a --filter "name=${KIND_CLUSTER_NAME}" + docker rm -f ${remaining_containers} + fi + + remaining_containers=$(docker ps -aq --filter "name=${KIND_CLUSTER_NAME}") + if [[ -n "${remaining_containers}" ]]; then + echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:" + docker ps -a --filter "name=${KIND_CLUSTER_NAME}" + exit 1 + fi + - name: Create GPU-enabled kind cluster shell: bash + env: + KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} + CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }} run: | - nvkind cluster create --name="${KIND_CLUSTER_NAME}" || echo "::warning::nvkind cluster create returned non-zero (umount errors are expected with CDI mode)" + set -euo pipefail + + CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}") + if [[ -n "${KIND_NODE_IMAGE}" ]]; then + echo "Using kind node image: ${KIND_NODE_IMAGE}" + CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}") + fi + + set +e + timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}" + create_status=$? 
+ set -e + if (( create_status != 0 )); then + echo "::warning::nvkind cluster create exited with status ${create_status}; continuing only if post-create checks pass" + fi + kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide + echo "=== Control-plane resource requests/limits ===" + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ + get pods -l tier=control-plane -o json | jq -r ' + .items[] as $pod | + $pod.metadata.name, + ($pod.spec.containers[] | + " " + .name + + " requests=" + ((.resources.requests // {}) | tostring) + + " limits=" + ((.resources.limits // {}) | tostring)) + ' || true + - name: Print GPUs (nvkind) shell: bash run: nvkind cluster print-gpus --name="${KIND_CLUSTER_NAME}" diff --git a/.github/actions/load-versions/action.yml b/.github/actions/load-versions/action.yml index b87e321d1..b3c506d40 100644 --- a/.github/actions/load-versions/action.yml +++ b/.github/actions/load-versions/action.yml @@ -40,6 +40,9 @@ outputs: kind: description: 'Kind version' value: ${{ steps.versions.outputs.kind }} + nvkind: + description: 'nvkind git ref' + value: ${{ steps.versions.outputs.nvkind }} ctlptl: description: 'ctlptl version' value: ${{ steps.versions.outputs.ctlptl }} @@ -91,6 +94,9 @@ outputs: kind_node_image: description: 'Kind node image for testing' value: ${{ steps.versions.outputs.kind_node_image }} + h100_kind_node_image: + description: 'Kind node image for H100 GPU tests' + value: ${{ steps.versions.outputs.h100_kind_node_image }} runs: using: 'composite' @@ -121,6 +127,7 @@ runs: # Testing tools echo "kubectl=$(yq eval '.testing_tools.kubectl' .settings.yaml)" >> $GITHUB_OUTPUT echo "kind=$(yq eval '.testing_tools.kind' .settings.yaml)" >> $GITHUB_OUTPUT + echo "nvkind=$(yq eval '.testing_tools.nvkind' .settings.yaml)" >> $GITHUB_OUTPUT echo "ctlptl=$(yq eval 
'.testing_tools.ctlptl' .settings.yaml)" >> $GITHUB_OUTPUT echo "tilt=$(yq eval '.testing_tools.tilt' .settings.yaml)" >> $GITHUB_OUTPUT echo "helm=$(yq eval '.testing_tools.helm' .settings.yaml)" >> $GITHUB_OUTPUT @@ -141,6 +148,7 @@ runs: # Testing configuration echo "kind_node_image=$(yq eval '.testing.kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT + echo "h100_kind_node_image=$(yq eval '.testing.h100_kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT - name: Display loaded versions shell: bash @@ -158,6 +166,7 @@ runs: echo " grype: ${{ steps.versions.outputs.grype }}" echo " kubectl: ${{ steps.versions.outputs.kubectl }}" echo " kind: ${{ steps.versions.outputs.kind }}" + echo " nvkind: ${{ steps.versions.outputs.nvkind }}" echo " ctlptl: ${{ steps.versions.outputs.ctlptl }}" echo " tilt: ${{ steps.versions.outputs.tilt }}" echo " helm: ${{ steps.versions.outputs.helm }}" @@ -172,3 +181,4 @@ runs: echo " lint_timeout: ${{ steps.versions.outputs.lint_timeout }}" echo " test_timeout: ${{ steps.versions.outputs.test_timeout }}" echo " kind_node_image: ${{ steps.versions.outputs.kind_node_image }}" + echo " h100_kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}" diff --git a/.settings.yaml b/.settings.yaml index 75b4559b1..5ef15198f 100644 --- a/.settings.yaml +++ b/.settings.yaml @@ -40,6 +40,7 @@ security_tools: testing_tools: kubectl: 'v1.35.0' kind: '0.31.0' + nvkind: '78a0a514c41c3e77ac0d935f38d971d3b4455138' ctlptl: '0.9.0' tilt: '0.37.0' helm: 'v4.1.1' @@ -71,6 +72,7 @@ docs_tools: # Testing Configuration testing: kind_node_image: 'kindest/node:v1.32.0' + h100_kind_node_image: 'kindest/node:v1.35.0' # Component test harness configuration # Used by tools/component-test/ scripts to validate individual components From b7ffe6beeed5059f8feeebce62686e97fe9a5975 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 18:07:04 -0700 Subject: [PATCH 04/21] ci: tune H100 workflow reliability --- 
.../check-control-plane-health/action.yml | 294 +++++++++++++++--- .github/actions/gpu-test-cleanup/action.yml | 37 ++- .../workflows/gpu-h100-inference-test.yaml | 128 +++++--- .github/workflows/gpu-h100-training-test.yaml | 117 ++++--- .../kind-inference-dynamo/chainsaw-test.yaml | 2 + .../kind-training-kubeflow/chainsaw-test.yaml | 2 + 6 files changed, 443 insertions(+), 137 deletions(-) diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml index ae2a87db7..bee31623b 100644 --- a/.github/actions/check-control-plane-health/action.yml +++ b/.github/actions/check-control-plane-health/action.yml @@ -13,7 +13,7 @@ # limitations under the License. name: 'Check Control Plane Health' -description: 'Fails if Kind control-plane static pods are missing, unready, or restarted.' +description: 'Fails if Kind control-plane static pods are missing, unready, or unstable.' inputs: cluster_name: @@ -35,6 +35,22 @@ inputs: description: 'Maximum tolerated restart count for each control-plane container' required: false default: '1' + stability_window: + description: 'Optional duration to watch for new control-plane restarts after pods are Ready' + required: false + default: '0s' + recover_unhealthy: + description: 'Restart eligible Kind control-plane static pod containers when they are currently unhealthy' + required: false + default: 'false' + recovery_components: + description: 'Space-separated component label values eligible for recovery' + required: false + default: kube-controller-manager kube-scheduler kube-apiserver + max_recovery_attempts: + description: 'Maximum recovery attempts for each eligible component' + required: false + default: '1' runs: using: 'composite' @@ -47,6 +63,10 @@ runs: COMPONENTS: ${{ inputs.components }} WAIT_TIMEOUT: ${{ inputs.wait_timeout }} MAX_RESTARTS: ${{ inputs.max_restarts }} + STABILITY_WINDOW: ${{ inputs.stability_window }} + RECOVER_UNHEALTHY: ${{ inputs.recover_unhealthy 
}} + RECOVERY_COMPONENTS: ${{ inputs.recovery_components }} + MAX_RECOVERY_ATTEMPTS: ${{ inputs.max_recovery_attempts }} run: | set -euo pipefail @@ -57,60 +77,262 @@ runs: exit 1 fi - kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz' || true + MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}" + MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}" + if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then + echo "::error::max_recovery_attempts must be a non-negative integer, got '${MAX_RECOVERY_ATTEMPTS}'" + exit 1 + fi - check_component() { - local component="$1" - local selector="component=${component}" - local pods + STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}" + STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}" + if [[ -z "${STABILITY_WINDOW}" ]]; then + STABILITY_WINDOW="0s" + fi + if ! [[ "${STABILITY_WINDOW}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::stability_window must be a duration like 0s, 60s, or 2m; got '${STABILITY_WINDOW}'" + exit 1 + fi - pods=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ - get pod -l "${selector}" -o name) - if [[ -z "${pods}" ]]; then - echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ - get pods -o wide || true + RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}" + RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}" + case "${RECOVER_UNHEALTHY}" in + true|false) ;; + *) + echo "::error::recover_unhealthy must be true or false, got '${RECOVER_UNHEALTHY}'" exit 1 - fi + ;; + esac + + kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" + } + + declare -A RECOVERY_ATTEMPTS=() + + kubectl_kind get --raw='/readyz' || true + + wait_ready() { + local 
component="$1" + local selector="component=${component}" if ! kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then - echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ - get pod -l "${selector}" -o wide || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ - describe pod -l "${selector}" || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ - get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz' || true - exit 1 + return 1 fi + } + restart_total() { + local component="$1" + local selector="component=${component}" local restart_counts - restart_counts=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ - get pod -l "${selector}" \ - -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}') + local restart_count + local total=0 + + if ! 
restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \ + -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then + echo "::error::failed to read restart counts for ${component} pods" >&2 + dump_component_diagnostics "${component}" >&2 + exit 1 + fi if [[ -z "${restart_counts}" ]]; then - echo "::error::no container statuses found for ${component} pods" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ - describe pod -l "${selector}" || true + echo "::error::no container statuses found for ${component} pods" >&2 + dump_component_diagnostics "${component}" >&2 exit 1 fi while IFS= read -r restart_count; do [[ -z "${restart_count}" ]] && continue - if (( restart_count > MAX_RESTARTS )); then - echo "::error::${component} restartCount=${restart_count}" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ - get pod -l "${selector}" -o wide || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ - describe pod -l "${selector}" || true + total=$((total + restart_count)) + done <<< "${restart_counts}" + echo "${total}" + } + + dump_component_diagnostics() { + local component="$1" + local selector="component=${component}" + local pods + local pod + + kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true + kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true + kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + + pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name 2>/dev/null || true) + while IFS= read -r pod; do + [[ -z "${pod}" ]] && continue + echo "=== ${pod} logs ===" + kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --tail=100 2>/dev/null || true + echo "=== ${pod} previous logs ===" + kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true + done <<< "${pods}" + + kubectl_kind -n 
kube-system get lease "${component}" -o yaml 2>/dev/null || true + } + + is_recovery_component() { + local component="$1" + local candidate + + for candidate in ${RECOVERY_COMPONENTS}; do + if [[ "${candidate}" == "${component}" ]]; then + return 0 + fi + done + return 1 + } + + try_recover_component() { + local component="$1" + local reason="$2" + local node="${KIND_CLUSTER_NAME}-control-plane" + local attempt + local container_ids + local container_id + + if [[ "${RECOVER_UNHEALTHY}" != "true" ]]; then + return 1 + fi + if (( MAX_RECOVERY_ATTEMPTS == 0 )); then + return 1 + fi + if ! is_recovery_component "${component}"; then + return 1 + fi + + attempt="${RECOVERY_ATTEMPTS[${component}]:-0}" + if (( attempt >= MAX_RECOVERY_ATTEMPTS )); then + return 1 + fi + RECOVERY_ATTEMPTS["${component}"]=$((attempt + 1)) + + echo "::warning::${component} is unhealthy (${reason}); restarting static pod container (attempt $((attempt + 1))/${MAX_RECOVERY_ATTEMPTS})" + dump_component_diagnostics "${component}" + + if ! docker inspect "${node}" >/dev/null 2>&1; then + echo "::warning::cannot recover ${component}: kind node container ${node} not found" + return 1 + fi + + container_ids=$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true) + if [[ -z "${container_ids}" ]]; then + echo "::warning::cannot recover ${component}: no running container found in ${node}" + return 1 + fi + + for container_id in ${container_ids}; do + echo "Stopping ${component} container ${container_id} in ${node}..." + if ! docker exec "${node}" crictl stop "${container_id}"; then + echo "::warning::failed to stop ${component} container ${container_id}" + return 1 + fi + done + + sleep 5 + if ! wait_ready "${component}"; then + echo "::warning::${component} did not recover after static pod container restart" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + return 1 + fi + + echo "${component} recovered after static pod container restart." 
+ return 0 + } + + check_component() { + local component="$1" + local selector="component=${component}" + local pods + local initial_restarts + local final_restarts + + if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then + if ! try_recover_component "${component}" "failed to list pods in ${NAMESPACE} with selector ${selector}"; then + echo "::error::failed to list ${component} pods in ${NAMESPACE} with selector ${selector}" + kubectl_kind -n "${NAMESPACE}" get pods -o wide || true exit 1 fi - done <<< "${restart_counts}" + if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then + echo "::error::failed to list ${component} pods after recovery" + kubectl_kind -n "${NAMESPACE}" get pods -o wide || true + exit 1 + fi + fi + if [[ -z "${pods}" ]]; then + echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}" + kubectl_kind -n "${NAMESPACE}" get pods -o wide || true + exit 1 + fi + + if ! wait_ready "${component}"; then + if ! try_recover_component "${component}" "pods did not become Ready within ${WAIT_TIMEOUT}"; then + echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + fi + initial_restarts=$(restart_total "${component}") + + if [[ "${STABILITY_WINDOW}" != "0s" ]]; then + if (( initial_restarts > MAX_RESTARTS )); then + echo "::warning::${component} historical restartCount=${initial_restarts}; checking for stability over ${STABILITY_WINDOW}" + dump_component_diagnostics "${component}" + fi + + sleep "${STABILITY_WINDOW}" + if ! wait_ready "${component}"; then + if ! 
try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then + echo "::error::${component} pods became unready during ${STABILITY_WINDOW}" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + initial_restarts=$(restart_total "${component}") + sleep "${STABILITY_WINDOW}" + if ! wait_ready "${component}"; then + echo "::error::${component} pods became unready after recovery" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + fi + final_restarts=$(restart_total "${component}") + if (( final_restarts > initial_restarts )); then + if ! try_recover_component "${component}" "restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"; then + echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}" + dump_component_diagnostics "${component}" + exit 1 + fi + initial_restarts=$(restart_total "${component}") + sleep "${STABILITY_WINDOW}" + if ! wait_ready "${component}"; then + echo "::error::${component} pods became unready after recovery" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + final_restarts=$(restart_total "${component}") + if (( final_restarts > initial_restarts )); then + echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery" + dump_component_diagnostics "${component}" + exit 1 + fi + fi + return + fi + + if (( initial_restarts > MAX_RESTARTS )); then + if ! 
try_recover_component "${component}" "restartCount=${initial_restarts}"; then + echo "::error::${component} restartCount=${initial_restarts}" + dump_component_diagnostics "${component}" + exit 1 + fi + fi } for component in ${COMPONENTS}; do check_component "${component}" done - kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz' + kubectl_kind get --raw='/readyz' diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml index d2089816b..e8536e530 100644 --- a/.github/actions/gpu-test-cleanup/action.yml +++ b/.github/actions/gpu-test-cleanup/action.yml @@ -33,13 +33,18 @@ runs: env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} run: | + set -o pipefail mkdir -p /tmp/debug-artifacts - kubectl --context="kind-${KIND_CLUSTER_NAME}" get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true + kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" + } + + kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true + kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true + 
kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true + kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true + kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true + kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true tar_inputs=() [[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml) [[ -d bundle ]] && tar_inputs+=(bundle) @@ -57,7 +62,16 @@ runs: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} run: | mkdir -p /tmp/kind-logs - kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true + timeout 300s kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true + + - name: Cleanup + if: always() + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: | + timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true + timeout 120s docker system prune -f || true - name: Upload debug artifacts if: failure() @@ -68,12 +82,3 @@ runs: /tmp/debug-artifacts/ /tmp/kind-logs/ retention-days: 7 - - - name: Cleanup - if: always() - shell: bash - env: - KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} - run: | - kind delete cluster --name "${KIND_CLUSTER_NAME}" || true - docker system prune -f || true diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index 373cd3c6e..95df8740e 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -111,19 +111,33 @@ jobs: with: persist-credentials: false + - name: Load GPU test versions + id: versions + uses: ./.github/actions/load-versions + - name: Set up GPU cluster uses: ./.github/actions/gpu-cluster-setup + with: + kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }} + 
min_gpu_count: '2' + gpu_model_pattern: H100 + min_free_disk_gb: '50' + min_available_memory_gb: '16' + cluster_create_timeout: 900s - name: Build aicr uses: ./.github/actions/aicr-build with: - validator_phases: 'conformance' + build_snapshot_agent: 'false' + validator_phases: 'none' - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' - name: Install runtime bundle id: bundle-install @@ -140,9 +154,18 @@ jobs: with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' # --- Snapshot and GPU validation --- + - name: Build snapshot agent image + uses: ./.github/actions/aicr-build + with: + build_cli: 'false' + build_snapshot_agent: 'true' + validator_phases: 'none' + - name: Snapshot and validate GPU uses: ./.github/actions/gpu-snapshot-validate with: @@ -158,6 +181,8 @@ jobs: with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' - name: Install Karpenter + KWOK uses: ./.github/actions/install-karpenter-kwok @@ -174,10 +199,8 @@ jobs: with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s - - - name: Prepare chainsaw - id: versions - uses: ./.github/actions/load-versions + stability_window: 60s + recover_unhealthy: 'true' - name: Install chainsaw uses: ./.github/actions/setup-build-tools @@ -196,6 +219,7 @@ jobs: # see a settled inference stack. 
- name: Verify expected resources exist + timeout-minutes: 3 run: | go run ./tests/chainsaw/ai-conformance/ \ --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ @@ -204,11 +228,20 @@ jobs: --kubeconfig="${HOME}/.kube/config" \ --debug + - name: Build conformance validator image + uses: ./.github/actions/aicr-build + with: + build_cli: 'false' + build_snapshot_agent: 'false' + validator_phases: 'conformance' + - name: Check control plane health before conformance validation uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' - name: Validate CNCF AI Conformance id: validate-conformance @@ -232,28 +265,9 @@ jobs: # training smoke test if needed. # --- Validation artifacts --- - # Collect a post-run resource snapshot whenever the runtime bundle - # installed. This preserves triage data for snapshot/chainsaw/conformance - # failures; continue-on-error keeps the original failure intact. 
- - name: Collect validation artifacts - if: >- - always() - && !cancelled() - && steps.bundle-install.outcome == 'success' - continue-on-error: true - shell: bash - run: | - set -o pipefail - mkdir -p conformance-evidence - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug | tee conformance-evidence/resource-existence-post.txt - - name: Upload validation artifacts if: always() + timeout-minutes: 5 uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: conformance-evidence @@ -264,57 +278,81 @@ jobs: - name: Debug diagnostics if: failure() + timeout-minutes: 5 + shell: bash run: | + set -o pipefail + kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" + } + echo "=== ClusterPolicy status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true + kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true echo "=== GPU Operator pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true + kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true echo "=== Non-running pods (all namespaces) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true + kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true echo "=== Recent events (gpu-operator) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true echo "=== Dynamo pods ===" - kubectl 
--context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true + kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true echo "=== Dynamo operator logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ + kubectl_kind -n dynamo-system \ logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true echo "=== Recent events (dynamo-system) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + echo "=== KAI scheduler pods ===" + kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true + echo "=== KAI admission deployment ===" + kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true + echo "=== KAI admission deployment describe ===" + kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true + echo "=== KAI admission pod describe ===" + kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \ + | grep '^pod/admission-' \ + | while read -r pod; do + kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true + done || true + echo "=== KAI admission logs ===" + kubectl_kind -n kai-scheduler \ + logs deployment/admission --all-containers --tail=200 2>/dev/null || true + echo "=== Recent events (kai-scheduler) ===" + kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true echo "=== Custom metrics API ===" for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do echo "--- ${METRIC} ---" for NS in gpu-operator dynamo-system; do - kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \ + kubectl_kind get --raw \ "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . 
|| true done done echo "=== Grafana deployment ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true + kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true echo "=== Grafana pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \ + kubectl_kind -n monitoring get pods \ -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true echo "=== Grafana deployment describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true + kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true echo "=== Grafana pod describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \ + kubectl_kind -n monitoring describe pods \ -l app.kubernetes.io/name=grafana 2>/dev/null || true echo "=== prometheus-adapter pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true + kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true echo "=== kgateway pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kgateway-system get pods -o wide 2>/dev/null || true + kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true echo "=== GatewayClass status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true + kubectl_kind get gatewayclass -o yaml 2>/dev/null || true echo "=== Gateway status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get gateways -A -o yaml 2>/dev/null || true + kubectl_kind get gateways -A -o yaml 2>/dev/null || true echo "=== DCGM Exporter pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ + kubectl_kind -n gpu-operator \ get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true echo "=== Monitoring pods ===" - kubectl 
--context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -o wide 2>/dev/null || true + kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true echo "=== DRA ResourceSlices ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get resourceslices -o wide 2>/dev/null || true + kubectl_kind get resourceslices -o wide 2>/dev/null || true echo "=== Node status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true + kubectl_kind get nodes -o wide 2>/dev/null || true - name: GPU Test Cleanup if: always() diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index 36a2d1eec..6193544fa 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -107,19 +107,33 @@ jobs: with: persist-credentials: false + - name: Load GPU test versions + id: versions + uses: ./.github/actions/load-versions + - name: Set up GPU cluster uses: ./.github/actions/gpu-cluster-setup + with: + kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }} + min_gpu_count: '2' + gpu_model_pattern: H100 + min_free_disk_gb: '50' + min_available_memory_gb: '16' + cluster_create_timeout: 900s - name: Build aicr uses: ./.github/actions/aicr-build with: - validator_phases: 'conformance' + build_snapshot_agent: 'false' + validator_phases: 'none' - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' - name: Install runtime bundle id: bundle-install @@ -133,13 +147,23 @@ jobs: best_effort: 'false' - name: Check control plane health after runtime install + id: post_runtime_control_plane_health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' # --- Snapshot and GPU validation --- + - 
name: Build snapshot agent image + uses: ./.github/actions/aicr-build + with: + build_cli: 'false' + build_snapshot_agent: 'true' + validator_phases: 'none' + - name: Snapshot and validate GPU uses: ./.github/actions/gpu-snapshot-validate with: @@ -155,6 +179,8 @@ jobs: with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' - name: Install Karpenter + KWOK uses: ./.github/actions/install-karpenter-kwok @@ -171,10 +197,8 @@ jobs: with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s - - - name: Prepare chainsaw - id: versions - uses: ./.github/actions/load-versions + stability_window: 60s + recover_unhealthy: 'true' - name: Install chainsaw uses: ./.github/actions/setup-build-tools @@ -193,6 +217,7 @@ jobs: # has had time to bootstrap (pod-autoscaling check needs live metric data). - name: Verify expected resources exist + timeout-minutes: 3 run: | go run ./tests/chainsaw/ai-conformance/ \ --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ @@ -201,11 +226,20 @@ jobs: --kubeconfig="${HOME}/.kube/config" \ --debug + - name: Build conformance validator image + uses: ./.github/actions/aicr-build + with: + build_cli: 'false' + build_snapshot_agent: 'false' + validator_phases: 'conformance' + - name: Check control plane health before conformance validation uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' - name: Validate CNCF AI Conformance id: validate-conformance @@ -225,28 +259,9 @@ jobs: # --- Validation artifacts --- - # Collect a post-run resource snapshot whenever the runtime bundle - # installed. This preserves triage data for snapshot/chainsaw/conformance - # failures; continue-on-error keeps the original failure intact. 
- - name: Collect validation artifacts - if: >- - always() - && !cancelled() - && steps.bundle-install.outcome == 'success' - continue-on-error: true - shell: bash - run: | - set -o pipefail - mkdir -p conformance-evidence - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug | tee conformance-evidence/resource-existence-post.txt - - name: Upload validation artifacts if: always() + timeout-minutes: 5 uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: conformance-evidence @@ -259,41 +274,63 @@ jobs: - name: Debug diagnostics if: failure() + timeout-minutes: 5 + shell: bash run: | + set -o pipefail + kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" + } + echo "=== Grafana deployment ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true + kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true echo "=== Grafana pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \ + kubectl_kind -n monitoring get pods \ -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true echo "=== Grafana deployment describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true + kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true echo "=== Grafana pod describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \ + kubectl_kind -n monitoring describe pods \ -l app.kubernetes.io/name=grafana 2>/dev/null || true echo "=== KAI scheduler pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true + kubectl_kind -n kai-scheduler 
get pods -o wide 2>/dev/null || true + echo "=== KAI admission deployment ===" + kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true + echo "=== KAI admission deployment describe ===" + kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true + echo "=== KAI admission pod describe ===" + kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \ + | grep '^pod/admission-' \ + | while read -r pod; do + kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true + done || true + echo "=== KAI admission logs ===" + kubectl_kind -n kai-scheduler \ + logs deployment/admission --all-containers --tail=200 2>/dev/null || true echo "=== KAI scheduler logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \ + kubectl_kind -n kai-scheduler \ logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true echo "=== KAI scheduler queues ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true + kubectl_kind get queues -A 2>/dev/null || true echo "=== KAI scheduler podgroups ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true + kubectl_kind get podgroups -A 2>/dev/null || true + echo "=== Recent events (kai-scheduler) ===" + kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true echo "=== Kubeflow Trainer deployment ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true + kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true echo "=== Kubeflow pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true + kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true echo "=== Kubeflow validating webhooks ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get validatingwebhookconfigurations 
validator.trainer.kubeflow.org -o yaml 2>/dev/null || true + kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true echo "=== Kubeflow Trainer CRD ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true + kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true echo "=== Non-running pods (all namespaces) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \ + kubectl_kind get pods -A \ --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true echo "=== GPU Operator pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true + kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true echo "=== Node resources ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \ + kubectl_kind describe nodes 2>/dev/null | \ grep -A 20 "Allocated resources" || true - name: GPU Test Cleanup diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml index 1b1f701ad..85aa33ab6 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml @@ -110,6 +110,8 @@ spec: # ── KAI Scheduler ────────────────────────────────────────────────── - name: assert-kai-scheduler description: Verify KAI scheduler is available. 
+ timeouts: + assert: 600s try: - assert: file: ../common/assert-kai-scheduler.yaml diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml index 9809cd0bb..e3d2b35a9 100644 --- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml @@ -101,6 +101,8 @@ spec: - name: assert-kai-scheduler description: Verify KAI scheduler is available. + timeouts: + assert: 600s try: - assert: file: ../common/assert-kai-scheduler.yaml From 0834778adcdd5fce8ae6f0fd3f6cc2b20713c9b4 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 18:27:13 -0700 Subject: [PATCH 05/21] ci: address H100 review feedback --- .github/actions/aicr-build/action.yml | 7 -- .../check-control-plane-health/action.yml | 10 ++- .github/actions/gpu-cluster-setup/action.yml | 65 ++++++++++++------- .../actions/gpu-snapshot-validate/action.yml | 23 ++++--- .github/actions/gpu-test-cleanup/action.yml | 1 - 5 files changed, 58 insertions(+), 48 deletions(-) diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml index ce9841f2f..14b68dcf8 100644 --- a/.github/actions/aicr-build/action.yml +++ b/.github/actions/aicr-build/action.yml @@ -36,13 +36,6 @@ inputs: runs: using: 'composite' steps: - - - name: Install ko - shell: bash - run: | - KO_VERSION=$(yq eval '.build_tools.ko' .settings.yaml) - GOFLAGS= go install "github.com/google/ko@${KO_VERSION}" - - name: Build aicr CLI binary if: inputs.build_cli == 'true' || inputs.build_snapshot_agent == 'true' shell: bash diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml index bee31623b..34d42937c 100644 --- a/.github/actions/check-control-plane-health/action.yml +++ b/.github/actions/check-control-plane-health/action.yml @@ -108,6 +108,10 @@ runs: timeout 30s 
kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" } + docker_timeout() { + timeout 30s docker "$@" + } + declare -A RECOVERY_ATTEMPTS=() kubectl_kind get --raw='/readyz' || true @@ -209,12 +213,12 @@ runs: echo "::warning::${component} is unhealthy (${reason}); restarting static pod container (attempt $((attempt + 1))/${MAX_RECOVERY_ATTEMPTS})" dump_component_diagnostics "${component}" - if ! docker inspect "${node}" >/dev/null 2>&1; then + if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then echo "::warning::cannot recover ${component}: kind node container ${node} not found" return 1 fi - container_ids=$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true) + container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true) if [[ -z "${container_ids}" ]]; then echo "::warning::cannot recover ${component}: no running container found in ${node}" return 1 @@ -222,7 +226,7 @@ runs: for container_id in ${container_ids}; do echo "Stopping ${component} container ${container_id} in ${node}..." - if ! docker exec "${node}" crictl stop "${container_id}"; then + if ! 
docker_timeout exec "${node}" crictl stop "${container_id}"; then echo "::warning::failed to stop ${component} container ${container_id}" return 1 fi diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml index c5b5bb3cd..d4ea8f744 100644 --- a/.github/actions/gpu-cluster-setup/action.yml +++ b/.github/actions/gpu-cluster-setup/action.yml @@ -89,7 +89,6 @@ runs: MIN_GPU_COUNT: ${{ inputs.min_gpu_count }} MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }} - KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} run: | set -euo pipefail @@ -110,18 +109,6 @@ runs: fi done - free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9') - if (( free_disk_gb < MIN_FREE_DISK_GB )); then - echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB" - exit 1 - fi - - available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}') - if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then - echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB" - exit 1 - fi - echo "=== Docker health ===" docker info >/dev/null docker version @@ -148,16 +135,6 @@ runs: kind get clusters || true docker ps -a --filter "name=${KIND_CLUSTER_NAME}" || true - if [[ -n "${KIND_NODE_IMAGE}" ]]; then - echo "=== Kind node image cache ===" - if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then - echo "Kind node image already cached: ${KIND_NODE_IMAGE}" - else - echo "Pulling kind node image: ${KIND_NODE_IMAGE}" - timeout 600s docker pull "${KIND_NODE_IMAGE}" - fi - fi - - name: Configure NVIDIA Container Toolkit for kind shell: bash run: | @@ -170,7 +147,7 @@ runs: shell: bash run: | set -euo pipefail - docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L + timeout 300s docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L - 
name: Increase inotify limits shell: bash @@ -184,7 +161,9 @@ runs: set -euo pipefail if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}" - timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}" + if ! timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then + echo "::warning::kind delete cluster timed out or failed; falling back to direct container cleanup" + fi else echo "No stale kind cluster named ${KIND_CLUSTER_NAME}" fi @@ -203,6 +182,42 @@ runs: exit 1 fi + - name: Check runner capacity + shell: bash + env: + MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} + MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }} + run: | + set -euo pipefail + free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9') + if (( free_disk_gb < MIN_FREE_DISK_GB )); then + echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB" + exit 1 + fi + + available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}') + if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then + echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB" + exit 1 + fi + + echo "Runner capacity is sufficient: disk=${free_disk_gb}GiB memory=${available_memory_gb}GiB" + + - name: Warm kind node image + if: ${{ inputs.kind_node_image != '' }} + shell: bash + env: + KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} + run: | + set -euo pipefail + echo "=== Kind node image cache ===" + if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then + echo "Kind node image already cached: ${KIND_NODE_IMAGE}" + else + echo "Pulling kind node image: ${KIND_NODE_IMAGE}" + timeout 600s docker pull "${KIND_NODE_IMAGE}" + fi + - name: Create GPU-enabled kind cluster shell: bash env: diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml index 9cfb67e9c..b89224a60 100644 --- 
a/.github/actions/gpu-snapshot-validate/action.yml +++ b/.github/actions/gpu-snapshot-validate/action.yml @@ -69,22 +69,21 @@ runs: if: failure() shell: bash run: | + kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${{ inputs.cluster_name }}" "$@" + } + echo "=== Snapshot Job ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default get job aicr -o yaml || true + kubectl_kind -n default get job aicr -o yaml || true echo "=== Snapshot Pods ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - get pods -l app.kubernetes.io/name=aicr -o wide || true + kubectl_kind -n default get pods -l app.kubernetes.io/name=aicr -o wide || true echo "=== Snapshot Job describe ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default describe job aicr || true + kubectl_kind -n default describe job aicr || true echo "=== Snapshot Pod describe ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - describe pods -l app.kubernetes.io/name=aicr || true + kubectl_kind -n default describe pods -l app.kubernetes.io/name=aicr || true echo "=== Snapshot current logs ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true + kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true echo "=== Snapshot previous logs ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true + kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true echo "=== Snapshot ConfigMap ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - get configmap aicr-snapshot -o yaml || true + kubectl_kind -n default get configmap aicr-snapshot -o yaml || true diff --git a/.github/actions/gpu-test-cleanup/action.yml 
b/.github/actions/gpu-test-cleanup/action.yml index e8536e530..2e3ca2685 100644 --- a/.github/actions/gpu-test-cleanup/action.yml +++ b/.github/actions/gpu-test-cleanup/action.yml @@ -71,7 +71,6 @@ runs: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} run: | timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true - timeout 120s docker system prune -f || true - name: Upload debug artifacts if: failure() From dfd2685dbdc557e7847d7fbc3d364965627948c3 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 18:28:54 -0700 Subject: [PATCH 06/21] ci: clarify control-plane recovery handling --- .github/actions/check-control-plane-health/action.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml index 34d42937c..fda1075c7 100644 --- a/.github/actions/check-control-plane-health/action.yml +++ b/.github/actions/check-control-plane-health/action.yml @@ -32,7 +32,7 @@ inputs: required: false default: 60s max_restarts: - description: 'Maximum tolerated restart count for each control-plane container' + description: 'Maximum tolerated restart count. With stability_window > 0, historical restarts are diagnostic and new restarts fail.' required: false default: '1' stability_window: @@ -218,7 +218,10 @@ runs: return 1 fi - container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true) + if ! 
container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null); then + echo "::warning::cannot recover ${component}: timed out or failed to list containers in ${node}" + return 1 + fi if [[ -z "${container_ids}" ]]; then echo "::warning::cannot recover ${component}: no running container found in ${node}" return 1 From 87d84a0bdaf74a460758ef03dd80f506c94d7642 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 18:35:44 -0700 Subject: [PATCH 07/21] ci: address remaining review comments --- .github/actions/aicr-build/action.yml | 1 + .../check-control-plane-health/action.yml | 2 +- .github/actions/gpu-cluster-setup/action.yml | 30 +++++++++++++++---- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml index 14b68dcf8..14d6a595b 100644 --- a/.github/actions/aicr-build/action.yml +++ b/.github/actions/aicr-build/action.yml @@ -43,6 +43,7 @@ runs: GOFLAGS: -mod=vendor run: | set -euo pipefail + mkdir -p dist if [[ -x dist/aicr ]]; then echo "Reusing existing dist/aicr" exit 0 diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml index fda1075c7..4063a330f 100644 --- a/.github/actions/check-control-plane-health/action.yml +++ b/.github/actions/check-control-plane-health/action.yml @@ -120,7 +120,7 @@ runs: local component="$1" local selector="component=${component}" - if ! kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + if ! 
timeout "${WAIT_TIMEOUT}" kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then return 1 fi diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml index d4ea8f744..fd310e8fa 100644 --- a/.github/actions/gpu-cluster-setup/action.yml +++ b/.github/actions/gpu-cluster-setup/action.yml @@ -119,7 +119,17 @@ runs: mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader) if [[ -n "${GPU_MODEL_PATTERN}" ]]; then - gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic "${GPU_MODEL_PATTERN}" || true) + set +e + gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic -- "${GPU_MODEL_PATTERN}") + grep_status=$? + set -e + if (( grep_status == 2 )); then + echo "::error::invalid gpu_model_pattern regex: ${GPU_MODEL_PATTERN}" + exit 1 + fi + if (( grep_status != 0 )); then + gpu_count=0 + fi echo "Visible GPUs matching '${GPU_MODEL_PATTERN}': ${gpu_count}" else gpu_count="${#gpu_names[@]}" @@ -133,7 +143,7 @@ runs: echo "=== Existing kind state ===" kind get clusters || true - docker ps -a --filter "name=${KIND_CLUSTER_NAME}" || true + docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true - name: Configure NVIDIA Container Toolkit for kind shell: bash @@ -159,6 +169,7 @@ runs: shell: bash run: | set -euo pipefail + kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}" if ! 
timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then @@ -168,17 +179,17 @@ runs: echo "No stale kind cluster named ${KIND_CLUSTER_NAME}" fi - remaining_containers=$(docker ps -aq --filter "name=${KIND_CLUSTER_NAME}") + remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}") if [[ -n "${remaining_containers}" ]]; then echo "Removing stale containers for ${KIND_CLUSTER_NAME}:" - docker ps -a --filter "name=${KIND_CLUSTER_NAME}" + docker ps -a --filter "label=${kind_cluster_label}" docker rm -f ${remaining_containers} fi - remaining_containers=$(docker ps -aq --filter "name=${KIND_CLUSTER_NAME}") + remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}") if [[ -n "${remaining_containers}" ]]; then echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:" - docker ps -a --filter "name=${KIND_CLUSTER_NAME}" + docker ps -a --filter "label=${kind_cluster_label}" exit 1 fi @@ -208,6 +219,7 @@ runs: shell: bash env: KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} + MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} run: | set -euo pipefail echo "=== Kind node image cache ===" @@ -217,6 +229,12 @@ runs: echo "Pulling kind node image: ${KIND_NODE_IMAGE}" timeout 600s docker pull "${KIND_NODE_IMAGE}" fi + free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9') + if (( free_disk_gb < MIN_FREE_DISK_GB )); then + echo "::error::free disk on / is ${free_disk_gb}GiB after warming ${KIND_NODE_IMAGE}, need at least ${MIN_FREE_DISK_GB}GiB" + exit 1 + fi + echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gb}GiB" - name: Create GPU-enabled kind cluster shell: bash From d14f9c56d4a8fc358d200e58d0a4b44d4ddf3fa1 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 18:39:51 -0700 Subject: [PATCH 08/21] ci: bound H100 retry budgets --- .../check-control-plane-health/action.yml | 32 +++++++++++-------- pkg/bundler/deployer/helm/helm_test.go | 6 ++-- 
.../deployer/helm/templates/deploy.sh.tmpl | 8 +++-- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml index 4063a330f..153ecfa2b 100644 --- a/.github/actions/check-control-plane-health/action.yml +++ b/.github/actions/check-control-plane-health/action.yml @@ -32,7 +32,7 @@ inputs: required: false default: 60s max_restarts: - description: 'Maximum tolerated restart count. With stability_window > 0, historical restarts are diagnostic and new restarts fail.' + description: 'Maximum tolerated restart count for each control-plane component' required: false default: '1' stability_window: @@ -152,6 +152,18 @@ runs: echo "${total}" } + enforce_restart_budget() { + local component="$1" + local restart_count="$2" + + if (( restart_count > MAX_RESTARTS )); then + echo "::error::${component} restartCount=${restart_count} exceeds max_restarts=${MAX_RESTARTS}" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + } + dump_component_diagnostics() { local component="$1" local selector="component=${component}" @@ -281,13 +293,9 @@ runs: fi fi initial_restarts=$(restart_total "${component}") + enforce_restart_budget "${component}" "${initial_restarts}" if [[ "${STABILITY_WINDOW}" != "0s" ]]; then - if (( initial_restarts > MAX_RESTARTS )); then - echo "::warning::${component} historical restartCount=${initial_restarts}; checking for stability over ${STABILITY_WINDOW}" - dump_component_diagnostics "${component}" - fi - sleep "${STABILITY_WINDOW}" if ! wait_ready "${component}"; then if ! try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then @@ -297,6 +305,7 @@ runs: exit 1 fi initial_restarts=$(restart_total "${component}") + enforce_restart_budget "${component}" "${initial_restarts}" sleep "${STABILITY_WINDOW}" if ! 
wait_ready "${component}"; then echo "::error::${component} pods became unready after recovery" @@ -306,6 +315,7 @@ runs: fi fi final_restarts=$(restart_total "${component}") + enforce_restart_budget "${component}" "${final_restarts}" if (( final_restarts > initial_restarts )); then if ! try_recover_component "${component}" "restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"; then echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}" @@ -313,6 +323,7 @@ runs: exit 1 fi initial_restarts=$(restart_total "${component}") + enforce_restart_budget "${component}" "${initial_restarts}" sleep "${STABILITY_WINDOW}" if ! wait_ready "${component}"; then echo "::error::${component} pods became unready after recovery" @@ -321,6 +332,7 @@ runs: exit 1 fi final_restarts=$(restart_total "${component}") + enforce_restart_budget "${component}" "${final_restarts}" if (( final_restarts > initial_restarts )); then echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery" dump_component_diagnostics "${component}" @@ -329,14 +341,6 @@ runs: fi return fi - - if (( initial_restarts > MAX_RESTARTS )); then - if ! 
try_recover_component "${component}" "restartCount=${initial_restarts}"; then - echo "::error::${component} restartCount=${initial_restarts}" - dump_component_diagnostics "${component}" - exit 1 - fi - fi } for component in ${COMPONENTS}; do diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 79c22f8fd..0a14dc03e 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -555,9 +555,9 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { Type: recipe.ComponentTypeHelm, Source: "https://prometheus-community.github.io/helm-charts", }, - wantTimeout: `COMPONENT_HELM_TIMEOUT="20m"`, - wantComment: `Keep the default retry budget for kube-prometheus-stack`, - rejectRetryCap: true, + wantTimeout: `COMPONENT_HELM_TIMEOUT="20m"`, + wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 2 ]]`, + wantComment: `Allow the observed third-attempt Grafana success pattern`, }, } diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index e456bba58..02c04d0a9 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -387,9 +387,11 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then fi {{ else if eq .Name "kube-prometheus-stack" -}} COMPONENT_HELM_TIMEOUT="20m" -# Keep the default retry budget for kube-prometheus-stack. On cold H100 -# runners, Grafana can hit ProgressDeadlineExceeded multiple times before -# images and rollout state are warm enough for a later retry to succeed. +# Allow the observed third-attempt Grafana success pattern, but cap the budget +# so kube-prometheus-stack cannot consume most of the H100 workflow timeout. +if [[ "${COMPONENT_MAX_RETRIES}" -gt 2 ]]; then + COMPONENT_MAX_RETRIES="2" +fi {{ end -}} # Derive wait args: global --wait/--no-wait behavior + component timeout. 
if [[ "${NO_WAIT}" == "true" ]]; then From 7a3505ccec453aded444fe995d150861ec0c23dd Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 18:50:33 -0700 Subject: [PATCH 09/21] ci: install ko for Karpenter KWOK --- .github/actions/install-karpenter-kwok/action.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml index dab642174..c917b2abc 100644 --- a/.github/actions/install-karpenter-kwok/action.yml +++ b/.github/actions/install-karpenter-kwok/action.yml @@ -40,8 +40,15 @@ runs: shell: bash run: | echo "karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" >> "$GITHUB_OUTPUT" + echo "ko=$(yq eval '.build_tools.ko' .settings.yaml)" >> "$GITHUB_OUTPUT" echo "go=$(go env GOVERSION)" >> "$GITHUB_OUTPUT" + - name: Install ko + uses: ./.github/actions/setup-build-tools + with: + install_ko: 'true' + ko_version: ${{ steps.versions.outputs.ko }} + - name: Cache Karpenter Go build cache uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 with: From c1ccd86ab53c8a653dd0f2d6fae3480af0d534b0 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 20:23:37 -0700 Subject: [PATCH 10/21] ci: retry KWOK Helm bootstrap --- kwok/scripts/run-all-recipes.sh | 34 ++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh index 459b054b5..6b4af1549 100755 --- a/kwok/scripts/run-all-recipes.sh +++ b/kwok/scripts/run-all-recipes.sh @@ -37,6 +37,31 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } log_error() { echo -e "${RED}[ERROR]${NC} $*"; } +retry_command() { + local description="$1" + shift + + local max_attempts="${KWOK_COMMAND_RETRIES:-3}" + local delay="${KWOK_COMMAND_RETRY_DELAY:-5}" + local attempt=1 + + while true; do + if "$@"; then + return 0 + fi + + if ((attempt >= 
max_attempts)); then + log_error "${description} failed after ${attempt} attempt(s)" + return 1 + fi + + log_warn "${description} failed (attempt ${attempt}/${max_attempts}); retrying in ${delay}s..." + sleep "${delay}" + attempt=$((attempt + 1)) + delay=$((delay * 2)) + done +} + # Find recipes with service criteria (testable cloud configurations) get_recipes() { for overlay in "${OVERLAYS_DIR}"/*.yaml; do @@ -68,10 +93,13 @@ ensure_cluster() { if ! kubectl get deployment -n kube-system kwok-controller &>/dev/null; then log_info "Installing KWOK controller..." - helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update - helm upgrade --install kwok-controller kwok/kwok \ + retry_command "Adding KWOK Helm repository" \ + helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update + retry_command "Installing KWOK controller" \ + helm upgrade --install kwok-controller kwok/kwok \ --namespace kube-system --set hostNetwork=true --wait - helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system + retry_command "Installing KWOK stage-fast" \ + helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system fi # Patch kindnet to exclude KWOK nodes From 34f115e3c20b927c9ec1e14d47858a47808b3dd9 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 20:45:06 -0700 Subject: [PATCH 11/21] ci: share control plane stability window --- .../check-control-plane-health/action.yml | 81 +++++++++++++------ 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml index 153ecfa2b..600310972 100644 --- a/.github/actions/check-control-plane-health/action.yml +++ b/.github/actions/check-control-plane-health/action.yml @@ -264,7 +264,6 @@ runs: local selector="component=${component}" local pods local initial_restarts - local final_restarts if ! 
pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then if ! try_recover_component "${component}" "failed to list pods in ${NAMESPACE} with selector ${selector}"; then @@ -294,9 +293,27 @@ runs: fi initial_restarts=$(restart_total "${component}") enforce_restart_budget "${component}" "${initial_restarts}" + INITIAL_RESTARTS["${component}"]="${initial_restarts}" + } + + verify_stability_window() { + local component + local initial_restarts + local final_restarts + local recovered=false - if [[ "${STABILITY_WINDOW}" != "0s" ]]; then - sleep "${STABILITY_WINDOW}" + if [[ "${STABILITY_WINDOW}" == "0s" ]]; then + return + fi + + echo "Observing control-plane stability for ${STABILITY_WINDOW}..." + sleep "${STABILITY_WINDOW}" + for component in ${COMPONENTS}; do + initial_restarts="${INITIAL_RESTARTS[${component}]:-}" + if [[ -z "${initial_restarts}" ]]; then + echo "::error::missing initial restart count for ${component}" + exit 1 + fi if ! wait_ready "${component}"; then if ! try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then echo "::error::${component} pods became unready during ${STABILITY_WINDOW}" @@ -306,13 +323,9 @@ runs: fi initial_restarts=$(restart_total "${component}") enforce_restart_budget "${component}" "${initial_restarts}" - sleep "${STABILITY_WINDOW}" - if ! wait_ready "${component}"; then - echo "::error::${component} pods became unready after recovery" - dump_component_diagnostics "${component}" - kubectl_kind get --raw='/readyz' || true - exit 1 - fi + INITIAL_RESTARTS["${component}"]="${initial_restarts}" + recovered=true + continue fi final_restarts=$(restart_total "${component}") enforce_restart_budget "${component}" "${final_restarts}" @@ -324,26 +337,46 @@ runs: fi initial_restarts=$(restart_total "${component}") enforce_restart_budget "${component}" "${initial_restarts}" - sleep "${STABILITY_WINDOW}" - if ! 
wait_ready "${component}"; then - echo "::error::${component} pods became unready after recovery" - dump_component_diagnostics "${component}" - kubectl_kind get --raw='/readyz' || true - exit 1 - fi - final_restarts=$(restart_total "${component}") - enforce_restart_budget "${component}" "${final_restarts}" - if (( final_restarts > initial_restarts )); then - echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery" - dump_component_diagnostics "${component}" - exit 1 - fi + INITIAL_RESTARTS["${component}"]="${initial_restarts}" + recovered=true + continue fi + INITIAL_RESTARTS["${component}"]="${final_restarts}" + done + + if [[ "${recovered}" != "true" ]]; then return fi + + echo "::warning::control-plane recovery occurred; observing one additional ${STABILITY_WINDOW} stability window" + sleep "${STABILITY_WINDOW}" + for component in ${COMPONENTS}; do + initial_restarts="${INITIAL_RESTARTS[${component}]:-}" + if [[ -z "${initial_restarts}" ]]; then + echo "::error::missing post-recovery restart count for ${component}" + exit 1 + fi + if ! 
wait_ready "${component}"; then + echo "::error::${component} pods became unready after recovery" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + final_restarts=$(restart_total "${component}") + enforce_restart_budget "${component}" "${final_restarts}" + if (( final_restarts > initial_restarts )); then + echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery" + dump_component_diagnostics "${component}" + exit 1 + fi + INITIAL_RESTARTS["${component}"]="${final_restarts}" + done } + declare -A INITIAL_RESTARTS=() + for component in ${COMPONENTS}; do check_component "${component}" done + verify_stability_window kubectl_kind get --raw='/readyz' From 73dfd1e4929a14eca1a03f0e00239770b6d937de Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sat, 25 Apr 2026 22:12:31 -0700 Subject: [PATCH 12/21] ci: harden H100 runtime diagnostics --- .github/actions/gpu-cluster-setup/action.yml | 180 ++++++++++++++++++ .../actions/gpu-operator-install/action.yml | 13 +- .github/actions/gpu-test-cleanup/action.yml | 9 + .../workflows/gpu-h100-inference-test.yaml | 71 ++++--- .github/workflows/gpu-h100-training-test.yaml | 62 +++--- pkg/bundler/deployer/helm/helm_test.go | 6 +- .../deployer/helm/templates/deploy.sh.tmpl | 9 +- 7 files changed, 296 insertions(+), 54 deletions(-) diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml index fd310e8fa..3800b4b99 100644 --- a/.github/actions/gpu-cluster-setup/action.yml +++ b/.github/actions/gpu-cluster-setup/action.yml @@ -40,6 +40,42 @@ inputs: description: 'Timeout for nvkind cluster create' required: false default: '900s' + control_plane_resource_patches: + description: 'Apply kubeadm patches that raise control-plane static pod resource requests' + required: false + default: 'false' + api_server_cpu_request: + description: 'kube-apiserver CPU request when 
control_plane_resource_patches is true' + required: false + default: '1000m' + api_server_memory_request: + description: 'kube-apiserver memory request when control_plane_resource_patches is true' + required: false + default: '1Gi' + controller_manager_cpu_request: + description: 'kube-controller-manager CPU request when control_plane_resource_patches is true' + required: false + default: '1000m' + controller_manager_memory_request: + description: 'kube-controller-manager memory request when control_plane_resource_patches is true' + required: false + default: '512Mi' + scheduler_cpu_request: + description: 'kube-scheduler CPU request when control_plane_resource_patches is true' + required: false + default: '500m' + scheduler_memory_request: + description: 'kube-scheduler memory request when control_plane_resource_patches is true' + required: false + default: '256Mi' + etcd_cpu_request: + description: 'etcd CPU request when control_plane_resource_patches is true' + required: false + default: '1000m' + etcd_memory_request: + description: 'etcd memory request when control_plane_resource_patches is true' + required: false + default: '1Gi' runs: using: 'composite' @@ -241,6 +277,15 @@ runs: env: KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }} + CONTROL_PLANE_RESOURCE_PATCHES: ${{ inputs.control_plane_resource_patches }} + API_SERVER_CPU_REQUEST: ${{ inputs.api_server_cpu_request }} + API_SERVER_MEMORY_REQUEST: ${{ inputs.api_server_memory_request }} + CONTROLLER_MANAGER_CPU_REQUEST: ${{ inputs.controller_manager_cpu_request }} + CONTROLLER_MANAGER_MEMORY_REQUEST: ${{ inputs.controller_manager_memory_request }} + SCHEDULER_CPU_REQUEST: ${{ inputs.scheduler_cpu_request }} + SCHEDULER_MEMORY_REQUEST: ${{ inputs.scheduler_memory_request }} + ETCD_CPU_REQUEST: ${{ inputs.etcd_cpu_request }} + ETCD_MEMORY_REQUEST: ${{ inputs.etcd_memory_request }} run: | set -euo pipefail @@ -250,6 +295,131 @@ runs: 
CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}") fi + case "${CONTROL_PLANE_RESOURCE_PATCHES}" in + true) ;; + ""|false) CONTROL_PLANE_RESOURCE_PATCHES=false ;; + *) + echo "::error::control_plane_resource_patches must be true or false, got '${CONTROL_PLANE_RESOURCE_PATCHES}'" + exit 1 + ;; + esac + + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + patch_dir="$(mktemp -d)" + config_template="$(mktemp)" + + # Keep heredoc body indentation aligned with this run block. GitHub + # Actions strips the common run: | indent before bash sees it. + cat > "${patch_dir}/kube-apiserver+strategic.yaml" < "${patch_dir}/kube-controller-manager+strategic.yaml" < "${patch_dir}/kube-scheduler+strategic.yaml" < "${patch_dir}/etcd+strategic.yaml" < "${config_template}" <<'EOF' + kind: Cluster + apiVersion: kind.x-k8s.io/v1alpha4 + {{- if hasKey $ "name" }} + name: {{ $.name }} + {{- end }} + nodes: + - role: control-plane + {{- if hasKey $ "image" }} + image: {{ $.image }} + {{- end }} + extraMounts: + - hostPath: __PATCH_DIR__ + containerPath: /patches + kubeadmConfigPatches: + - | + kind: InitConfiguration + patches: + directory: /patches + {{- range $.workers }} + - role: worker + {{- if hasKey $ "image" }} + image: {{ $.image }} + {{- end }} + + {{- if hasKey . "devices" }} + {{- $devices := .devices }} + {{- if not (kindIs "slice" $devices) }} + {{- $devices = list .devices }} + {{- end }} + extraMounts: + # We inject all NVIDIA GPUs using the nvidia-container-runtime. 
+ # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set + # in `/etc/nvidia-container-runtime/config.toml` + {{- range $d := $devices }} + - hostPath: /dev/null + containerPath: /var/run/nvidia-container-devices/{{ $d }} + {{- end }} + {{- end }} + {{- end }} + EOF + sed -i "s#__PATCH_DIR__#${patch_dir}#g" "${config_template}" + echo "Applying control-plane static pod resource patches from ${patch_dir}:" + for patch_file in "${patch_dir}"/*.yaml; do + echo "--- ${patch_file}" + sed 's/^/ /' "${patch_file}" + done + CREATE_ARGS+=(--config-template="${config_template}") + fi + set +e timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}" create_status=$? @@ -261,6 +431,16 @@ runs: kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide + kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes | \ + grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:| cpu| memory| nvidia.com/gpu)" || true + + echo "=== Kind node container resources ===" + docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + --format '{{.Names}}' | sort | while read -r node_container; do + [[ -z "${node_container}" ]] && continue + docker inspect "${node_container}" \ + --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' + done echo "=== Control-plane resource requests/limits ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml index 86d247932..2727c5e6a 100644 --- a/.github/actions/gpu-operator-install/action.yml +++ b/.github/actions/gpu-operator-install/action.yml @@ -39,6 +39,10 @@ inputs: description: 
'Continue deploying remaining bundle components after a component failure' required: false default: 'true' + deploy_trace: + description: 'Run generated deploy.sh with bash xtrace for CI diagnostics. Do not enable for bundles with secret command arguments.' + required: false + default: 'false' runs: using: 'composite' @@ -124,7 +128,14 @@ runs: else echo "Deploying bundle with default args" fi - ./deploy.sh "${DEPLOY_ARGS[@]}" + if [[ "${{ inputs.deploy_trace }}" == "true" ]]; then + echo "Deploy trace enabled: running deploy.sh with bash xtrace" + echo "::warning::deploy_trace prints shell command arguments; disable it for bundles with secret values" + export PS4='+ ${BASH_SOURCE}:${LINENO}: ' + bash -x ./deploy.sh "${DEPLOY_ARGS[@]}" + else + ./deploy.sh "${DEPLOY_ARGS[@]}" + fi - name: Wait for GPU operands (bundle) if: inputs.method == 'bundle' diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml index 2e3ca2685..5d1cef3e4 100644 --- a/.github/actions/gpu-test-cleanup/action.yml +++ b/.github/actions/gpu-test-cleanup/action.yml @@ -71,6 +71,15 @@ runs: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} run: | timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true + kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" + remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}") + if [[ -n "${remaining_containers}" ]]; then + echo "Removing leftover kind containers for ${KIND_CLUSTER_NAME}:" + docker ps -a --filter "label=${kind_cluster_label}" + docker rm -f ${remaining_containers} || true + fi + timeout 60s docker builder prune -f --filter "until=24h" || true + timeout 60s docker system prune -f --filter "until=24h" || true - name: Upload debug artifacts if: failure() diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index 95df8740e..95340fbc8 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ 
b/.github/workflows/gpu-h100-inference-test.yaml @@ -58,6 +58,7 @@ jobs: - 'pkg/evidence/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' + - 'pkg/bundler/deployer/helm/**' - 'tests/chainsaw/chainsaw-config.yaml' - 'tests/chainsaw/ai-conformance/main.go' - 'tests/chainsaw/ai-conformance/common/**' @@ -124,6 +125,7 @@ jobs: min_free_disk_gb: '50' min_available_memory_gb: '16' cluster_create_timeout: 900s + control_plane_resource_patches: 'true' - name: Build aicr uses: ./.github/actions/aicr-build @@ -131,12 +133,14 @@ jobs: build_snapshot_agent: 'false' validator_phases: 'none' + # Fast readiness gate after cluster setup. Stability windows start after + # runtime install, where component rollouts can stress the control plane. - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s - stability_window: 60s + stability_window: 0s recover_unhealthy: 'true' - name: Install runtime bundle @@ -148,8 +152,9 @@ jobs: platform: dynamo wait: 'true' best_effort: 'false' + deploy_trace: 'true' - - name: Check control plane health after runtime install + - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} @@ -176,7 +181,7 @@ jobs: # --- Install Karpenter + KWOK early to give monitoring stack settle time --- - - name: Check control plane health before Karpenter + - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} @@ -194,7 +199,13 @@ jobs: # --- Health checks --- - - name: Check control plane health after Karpenter + - name: Install chainsaw + uses: ./.github/actions/setup-build-tools + with: + install_chainsaw: 'true' + chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' + + - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: 
${{ env.KIND_CLUSTER_NAME }} @@ -202,32 +213,20 @@ jobs: stability_window: 60s recover_unhealthy: 'true' - - name: Install chainsaw - uses: ./.github/actions/setup-build-tools - with: - install_chainsaw: 'true' - chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' - - name: Run chainsaw health checks + # The H100 stack can make namespace cleanup API calls slow under load. + # Keep cleanup enabled, but allow more than the default 30s deadline. run: | chainsaw test \ --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --config tests/chainsaw/chainsaw-config.yaml + --config tests/chainsaw/chainsaw-config.yaml \ + --cleanup-timeout 120s \ + --delete-timeout 120s # --- CNCF AI Conformance validation --- # Runs after the stack health checks so gateway and metrics validators # see a settled inference stack. - - name: Verify expected resources exist - timeout-minutes: 3 - run: | - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug - - name: Build conformance validator image uses: ./.github/actions/aicr-build with: @@ -235,7 +234,7 @@ jobs: build_snapshot_agent: 'false' validator_phases: 'conformance' - - name: Check control plane health before conformance validation + - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} @@ -276,6 +275,8 @@ jobs: validation-result.yaml if-no-files-found: warn + # --- Debug diagnostics (before cleanup so resources still exist) --- + - name: Debug diagnostics if: failure() timeout-minutes: 5 @@ -285,7 +286,26 @@ jobs: kubectl_kind() { timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" } + print_workload_images() { + local ns="$1" + kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 
2>/dev/null \ + | jq -r ' + .items[] | + [ + .kind, + .metadata.namespace + "/" + .metadata.name, + (([.spec.template.spec.containers[]?.image] + + [.spec.template.spec.initContainers[]?.image]) | unique | join(",")) + ] | @tsv + ' || true + } + echo "=== Workload image inventory ===" + for NS in cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \ + nvidia-network-operator kai-scheduler dynamo-system kgateway-system; do + echo "--- ${NS} ---" + print_workload_images "${NS}" + done echo "=== ClusterPolicy status ===" kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true echo "=== GPU Operator pods ===" @@ -316,6 +336,13 @@ jobs: echo "=== KAI admission logs ===" kubectl_kind -n kai-scheduler \ logs deployment/admission --all-containers --tail=200 2>/dev/null || true + echo "=== KAI scheduler logs ===" + kubectl_kind -n kai-scheduler \ + logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true + echo "=== KAI scheduler queues ===" + kubectl_kind get queues -A 2>/dev/null || true + echo "=== KAI scheduler podgroups ===" + kubectl_kind get podgroups -A 2>/dev/null || true echo "=== Recent events (kai-scheduler) ===" kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true echo "=== Custom metrics API ===" diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index 6193544fa..5ac900685 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -58,6 +58,7 @@ jobs: - 'pkg/evidence/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' + - 'pkg/bundler/deployer/helm/**' - 'tests/chainsaw/chainsaw-config.yaml' - 'tests/chainsaw/ai-conformance/main.go' - 'tests/chainsaw/ai-conformance/common/**' @@ -120,6 +121,7 @@ jobs: min_free_disk_gb: '50' min_available_memory_gb: '16' cluster_create_timeout: 900s + control_plane_resource_patches: 'true' - name: Build aicr 
uses: ./.github/actions/aicr-build @@ -127,12 +129,14 @@ jobs: build_snapshot_agent: 'false' validator_phases: 'none' + # Fast readiness gate after cluster setup. Stability windows start after + # runtime install, where component rollouts can stress the control plane. - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait_timeout: 120s - stability_window: 60s + stability_window: 0s recover_unhealthy: 'true' - name: Install runtime bundle @@ -145,8 +149,9 @@ jobs: platform: kubeflow wait: 'true' best_effort: 'false' + deploy_trace: 'true' - - name: Check control plane health after runtime install + - name: Check control plane health id: post_runtime_control_plane_health uses: ./.github/actions/check-control-plane-health with: @@ -174,7 +179,7 @@ jobs: # --- Install Karpenter + KWOK early to give monitoring stack settle time --- - - name: Check control plane health before Karpenter + - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} @@ -192,7 +197,13 @@ jobs: # --- Health checks --- - - name: Check control plane health after Karpenter + - name: Install chainsaw + uses: ./.github/actions/setup-build-tools + with: + install_chainsaw: 'true' + chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' + + - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} @@ -200,32 +211,20 @@ jobs: stability_window: 60s recover_unhealthy: 'true' - - name: Install chainsaw - uses: ./.github/actions/setup-build-tools - with: - install_chainsaw: 'true' - chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' - - name: Run chainsaw health checks + # The H100 stack can make namespace cleanup API calls slow under load. + # Keep cleanup enabled, but allow more than the default 30s deadline. 
run: | chainsaw test \ --test-dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --config tests/chainsaw/chainsaw-config.yaml + --config tests/chainsaw/chainsaw-config.yaml \ + --cleanup-timeout 120s \ + --delete-timeout 120s # --- CNCF AI Conformance validation --- # Runs last to ensure the DCGM → Prometheus → adapter pipeline # has had time to bootstrap (pod-autoscaling check needs live metric data). - - name: Verify expected resources exist - timeout-minutes: 3 - run: | - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug - - name: Build conformance validator image uses: ./.github/actions/aicr-build with: @@ -233,7 +232,7 @@ jobs: build_snapshot_agent: 'false' validator_phases: 'conformance' - - name: Check control plane health before conformance validation + - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} @@ -281,7 +280,26 @@ jobs: kubectl_kind() { timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" } + print_workload_images() { + local ns="$1" + kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \ + | jq -r ' + .items[] | + [ + .kind, + .metadata.namespace + "/" + .metadata.name, + (([.spec.template.spec.containers[]?.image] + + [.spec.template.spec.initContainers[]?.image]) | unique | join(",")) + ] | @tsv + ' || true + } + echo "=== Workload image inventory ===" + for NS in cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \ + nvidia-network-operator kai-scheduler kubeflow; do + echo "--- ${NS} ---" + print_workload_images "${NS}" + done echo "=== Grafana deployment ===" kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true echo "=== Grafana pods ===" 
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 0a14dc03e..8524e4bb7 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -555,9 +555,9 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { Type: recipe.ComponentTypeHelm, Source: "https://prometheus-community.github.io/helm-charts", }, - wantTimeout: `COMPONENT_HELM_TIMEOUT="20m"`, - wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 2 ]]`, - wantComment: `Allow the observed third-attempt Grafana success pattern`, + wantTimeout: `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`, + wantComment: `preserve the default retry`, + rejectRetryCap: true, }, } diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 02c04d0a9..8359fc07f 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -386,12 +386,9 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then COMPONENT_MAX_RETRIES="1" fi {{ else if eq .Name "kube-prometheus-stack" -}} -COMPONENT_HELM_TIMEOUT="20m" -# Allow the observed third-attempt Grafana success pattern, but cap the budget -# so kube-prometheus-stack cannot consume most of the H100 workflow timeout. -if [[ "${COMPONENT_MAX_RETRIES}" -gt 2 ]]; then - COMPONENT_MAX_RETRIES="2" -fi +# Grafana can trip its Deployment progress deadline before a longer Helm +# timeout helps. Keep the default 10m timeout and preserve the default retry +# budget so later upgrades can succeed after images and controllers settle. {{ end -}} # Derive wait args: global --wait/--no-wait behavior + component timeout. 
if [[ "${NO_WAIT}" == "true" ]]; then From 2887460ad90e1041277eaf06a3e236a173c4f063 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sun, 26 Apr 2026 07:50:33 -0700 Subject: [PATCH 13/21] ci: harden H100 control plane and Dynamo retries --- .github/actions/gpu-cluster-setup/action.yml | 64 ++++++++++++++++--- .../actions/gpu-operator-install/action.yml | 13 +--- .../workflows/gpu-h100-inference-test.yaml | 2 +- .github/workflows/gpu-h100-training-test.yaml | 2 +- docs/user/cli-reference.md | 2 +- pkg/bundler/deployer/helm/helm_test.go | 8 +++ .../deployer/helm/templates/deploy.sh.tmpl | 4 ++ 7 files changed, 72 insertions(+), 23 deletions(-) diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml index 3800b4b99..d20d7cbdd 100644 --- a/.github/actions/gpu-cluster-setup/action.yml +++ b/.github/actions/gpu-cluster-setup/action.yml @@ -44,6 +44,10 @@ inputs: description: 'Apply kubeadm patches that raise control-plane static pod resource requests' required: false default: 'false' + disable_control_plane_leader_election: + description: 'Disable kube-controller-manager and kube-scheduler leader election for single-control-plane kind CI clusters' + required: false + default: 'false' api_server_cpu_request: description: 'kube-apiserver CPU request when control_plane_resource_patches is true' required: false @@ -278,6 +282,7 @@ runs: KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }} CONTROL_PLANE_RESOURCE_PATCHES: ${{ inputs.control_plane_resource_patches }} + DISABLE_CONTROL_PLANE_LEADER_ELECTION: ${{ inputs.disable_control_plane_leader_election }} API_SERVER_CPU_REQUEST: ${{ inputs.api_server_cpu_request }} API_SERVER_MEMORY_REQUEST: ${{ inputs.api_server_memory_request }} CONTROLLER_MANAGER_CPU_REQUEST: ${{ inputs.controller_manager_cpu_request }} @@ -304,12 +309,22 @@ runs: ;; esac - if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + case 
"${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" in + true) ;; + ""|false) DISABLE_CONTROL_PLANE_LEADER_ELECTION=false ;; + *) + echo "::error::disable_control_plane_leader_election must be true or false, got '${DISABLE_CONTROL_PLANE_LEADER_ELECTION}'" + exit 1 + ;; + esac + + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then patch_dir="$(mktemp -d)" config_template="$(mktemp)" # Keep heredoc body indentation aligned with this run block. GitHub # Actions strips the common run: | indent before bash sees it. + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then cat > "${patch_dir}/kube-apiserver+strategic.yaml" < "${config_template}" <<'EOF' kind: Cluster @@ -381,14 +397,42 @@ runs: {{- if hasKey $ "image" }} image: {{ $.image }} {{- end }} + EOF + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + cat >> "${config_template}" <> "${config_template}" <<'EOF' kubeadmConfigPatches: + EOF + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + cat >> "${config_template}" <<'EOF' - | kind: InitConfiguration + apiVersion: kubeadm.k8s.io/v1beta4 patches: directory: /patches + EOF + fi + if [[ "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then + cat >> "${config_template}" <<'EOF' + - | + kind: ClusterConfiguration + apiVersion: kubeadm.k8s.io/v1beta4 + controllerManager: + extraArgs: + - name: leader-elect + value: "false" + scheduler: + extraArgs: + - name: leader-elect + value: "false" + EOF + fi + cat >> "${config_template}" <<'EOF' {{- range $.workers }} - role: worker {{- if hasKey $ "image" }} @@ -411,12 +455,16 @@ runs: {{- end }} {{- end }} EOF - sed -i "s#__PATCH_DIR__#${patch_dir}#g" "${config_template}" - echo "Applying control-plane static pod resource patches from ${patch_dir}:" - for patch_file in "${patch_dir}"/*.yaml; do - echo "--- ${patch_file}" - sed 's/^/ /' "${patch_file}" - done + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + echo 
"Applying control-plane static pod resource patches from ${patch_dir}:" + for patch_file in "${patch_dir}"/*.yaml; do + echo "--- ${patch_file}" + sed 's/^/ /' "${patch_file}" + done + fi + if [[ "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then + echo "Disabling kube-controller-manager and kube-scheduler leader election for single-control-plane kind CI." + fi CREATE_ARGS+=(--config-template="${config_template}") fi diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml index 2727c5e6a..86d247932 100644 --- a/.github/actions/gpu-operator-install/action.yml +++ b/.github/actions/gpu-operator-install/action.yml @@ -39,10 +39,6 @@ inputs: description: 'Continue deploying remaining bundle components after a component failure' required: false default: 'true' - deploy_trace: - description: 'Run generated deploy.sh with bash xtrace for CI diagnostics. Do not enable for bundles with secret command arguments.' - required: false - default: 'false' runs: using: 'composite' @@ -128,14 +124,7 @@ runs: else echo "Deploying bundle with default args" fi - if [[ "${{ inputs.deploy_trace }}" == "true" ]]; then - echo "Deploy trace enabled: running deploy.sh with bash xtrace" - echo "::warning::deploy_trace prints shell command arguments; disable it for bundles with secret values" - export PS4='+ ${BASH_SOURCE}:${LINENO}: ' - bash -x ./deploy.sh "${DEPLOY_ARGS[@]}" - else - ./deploy.sh "${DEPLOY_ARGS[@]}" - fi + ./deploy.sh "${DEPLOY_ARGS[@]}" - name: Wait for GPU operands (bundle) if: inputs.method == 'bundle' diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index 95340fbc8..ef08a55e0 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -126,6 +126,7 @@ jobs: min_available_memory_gb: '16' cluster_create_timeout: 900s control_plane_resource_patches: 'true' + disable_control_plane_leader_election: 
'true' - name: Build aicr uses: ./.github/actions/aicr-build @@ -152,7 +153,6 @@ jobs: platform: dynamo wait: 'true' best_effort: 'false' - deploy_trace: 'true' - name: Check control plane health uses: ./.github/actions/check-control-plane-health diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index 5ac900685..f3317b72b 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -122,6 +122,7 @@ jobs: min_available_memory_gb: '16' cluster_create_timeout: 900s control_plane_resource_patches: 'true' + disable_control_plane_leader_election: 'true' - name: Build aicr uses: ./.github/actions/aicr-build @@ -149,7 +150,6 @@ jobs: platform: kubeflow wait: 'true' best_effort: 'false' - deploy_trace: 'true' - name: Check control plane health id: post_runtime_control_plane_health diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index 302684d5a..862c098b4 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -1318,7 +1318,7 @@ After `helm install`, the same manifests are re-applied as post-install to ensur **Async components:** -Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. +Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. 
##### DRA kubelet plugin registration diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 8524e4bb7..7b52b74cf 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -528,6 +528,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { wantTimeout string wantRetryAssignment string wantRetryCap string + wantApplyArgs string wantComment string rejectRetryCap bool }{ @@ -544,6 +545,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { wantTimeout: `COMPONENT_HELM_TIMEOUT="30m"`, wantRetryAssignment: `COMPONENT_MAX_RETRIES="1"`, wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]`, + wantApplyArgs: `COMPONENT_HELM_APPLY_ARGS=(--server-side=false)`, }, { name: "kube-prometheus-stack", @@ -609,6 +611,12 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { if tt.wantRetryCap != "" && !strings.Contains(componentBlock, tt.wantRetryCap) { t.Errorf("deploy.sh missing %s retry cap %q", tt.component.Name, tt.wantRetryCap) } + if tt.wantApplyArgs != "" && !strings.Contains(componentBlock, tt.wantApplyArgs) { + t.Errorf("deploy.sh missing %s apply args %q", tt.component.Name, tt.wantApplyArgs) + } + if tt.wantApplyArgs != "" && !strings.Contains(script[blockStart:], `"${COMPONENT_HELM_APPLY_ARGS[@]}"`) { + t.Errorf("deploy.sh missing %s apply args in helm command", tt.component.Name) + } if tt.wantComment != "" && !strings.Contains(componentBlock, tt.wantComment) { t.Errorf("deploy.sh missing %s retry rationale", tt.component.Name) } diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 8359fc07f..711135bf4 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -375,6 +375,7 @@ retry "{{ .Name }} pre-install manifests" apply_ignoring_crd_race "${SCRIPT_DIR} # unnecessary retry cycles. 
COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}" COMPONENT_MAX_RETRIES="${MAX_RETRIES}" +COMPONENT_HELM_APPLY_ARGS=() {{ if eq .Name "kai-scheduler" -}} COMPONENT_HELM_TIMEOUT="30m" if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then @@ -382,6 +383,7 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then fi {{ else if eq .Name "dynamo-platform" -}} COMPONENT_HELM_TIMEOUT="30m" +COMPONENT_HELM_APPLY_ARGS=(--server-side=false) if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then COMPONENT_MAX_RETRIES="1" fi @@ -405,6 +407,7 @@ fi helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ "${COMPONENT_MAX_RETRIES}" \ helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ + "${COMPONENT_HELM_APPLY_ARGS[@]}" \ {{ if .Version }}--version {{ .Version }} \ {{ end -}} -n {{ .Namespace }} --create-namespace \ @@ -416,6 +419,7 @@ helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ "${COMPONENT_MAX_RETRIES}" \ helm upgrade --install {{ .Name }} {{ .ChartName }} \ + "${COMPONENT_HELM_APPLY_ARGS[@]}" \ --repo {{ .Repository }} \ {{ if .Version }}--version {{ .Version }} \ {{ end -}} From 23b3d5fa26daff9238ed7f193248c5cd71d359c4 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sun, 26 Apr 2026 09:39:15 -0700 Subject: [PATCH 14/21] ci: harden H100 control plane and Dynamo retries --- .../check-control-plane-health/action.yml | 59 ++++++++++--------- .github/actions/gpu-cluster-setup/action.yml | 56 +++++++++++++----- .../workflows/gpu-h100-inference-test.yaml | 2 +- .github/workflows/gpu-h100-training-test.yaml | 2 +- docs/user/cli-reference.md | 2 +- pkg/bundler/deployer/helm/helm_test.go | 17 +++++- .../deployer/helm/templates/deploy.sh.tmpl | 35 ++++++++++- 7 files changed, 120 insertions(+), 53 deletions(-) diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml index 600310972..fdb9b4a2e 100644 --- 
a/.github/actions/check-control-plane-health/action.yml +++ b/.github/actions/check-control-plane-health/action.yml @@ -32,7 +32,7 @@ inputs: required: false default: 60s max_restarts: - description: 'Maximum tolerated restart count for each control-plane component' + description: 'Deprecated compatibility input; historical restart counts are reported but not capped' required: false default: '1' stability_window: @@ -62,7 +62,6 @@ runs: NAMESPACE: ${{ inputs.namespace }} COMPONENTS: ${{ inputs.components }} WAIT_TIMEOUT: ${{ inputs.wait_timeout }} - MAX_RESTARTS: ${{ inputs.max_restarts }} STABILITY_WINDOW: ${{ inputs.stability_window }} RECOVER_UNHEALTHY: ${{ inputs.recover_unhealthy }} RECOVERY_COMPONENTS: ${{ inputs.recovery_components }} @@ -70,13 +69,6 @@ runs: run: | set -euo pipefail - MAX_RESTARTS="${MAX_RESTARTS#"${MAX_RESTARTS%%[![:space:]]*}"}" - MAX_RESTARTS="${MAX_RESTARTS%"${MAX_RESTARTS##*[![:space:]]}"}" - if ! [[ "${MAX_RESTARTS}" =~ ^[0-9]+$ ]]; then - echo "::error::max_restarts must be a non-negative integer, got '${MAX_RESTARTS}'" - exit 1 - fi - MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}" MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}" if ! 
[[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then @@ -152,14 +144,30 @@ runs: echo "${total}" } - enforce_restart_budget() { + report_restart_baseline() { local component="$1" local restart_count="$2" - if (( restart_count > MAX_RESTARTS )); then - echo "::error::${component} restartCount=${restart_count} exceeds max_restarts=${MAX_RESTARTS}" - dump_component_diagnostics "${component}" - kubectl_kind get --raw='/readyz' || true + if (( restart_count > 0 )); then + echo "::warning::${component} has historical restartCount=${restart_count}; checking current readiness and stability window only" + return + fi + echo "${component} restartCount=${restart_count}" + } + + dump_control_plane_summary() { + echo "=== Control-plane pod restart summary ===" + kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane -o wide || true + kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane \ + -o jsonpath='{range .items[*]}{.metadata.name}{" restartCount="}{range .status.containerStatuses[*]}{.restartCount}{" "}{end}{"\n"}{end}' || true + } + + require_readyz() { + local reason="$1" + + if ! 
kubectl_kind get --raw='/readyz'; then + echo "::error::kube-apiserver /readyz failed ${reason}" + dump_control_plane_summary exit 1 fi } @@ -170,6 +178,7 @@ runs: local pods local pod + dump_control_plane_summary kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true @@ -292,7 +301,7 @@ runs: fi fi initial_restarts=$(restart_total "${component}") - enforce_restart_budget "${component}" "${initial_restarts}" + report_restart_baseline "${component}" "${initial_restarts}" INITIAL_RESTARTS["${component}"]="${initial_restarts}" } @@ -322,24 +331,17 @@ runs: exit 1 fi initial_restarts=$(restart_total "${component}") - enforce_restart_budget "${component}" "${initial_restarts}" + report_restart_baseline "${component}" "${initial_restarts}" INITIAL_RESTARTS["${component}"]="${initial_restarts}" recovered=true continue fi final_restarts=$(restart_total "${component}") - enforce_restart_budget "${component}" "${final_restarts}" if (( final_restarts > initial_restarts )); then - if ! 
try_recover_component "${component}" "restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"; then - echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}" - dump_component_diagnostics "${component}" - exit 1 - fi - initial_restarts=$(restart_total "${component}") - enforce_restart_budget "${component}" "${initial_restarts}" - INITIAL_RESTARTS["${component}"]="${initial_restarts}" - recovered=true - continue + echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 fi INITIAL_RESTARTS["${component}"]="${final_restarts}" done @@ -363,7 +365,6 @@ runs: exit 1 fi final_restarts=$(restart_total "${component}") - enforce_restart_budget "${component}" "${final_restarts}" if (( final_restarts > initial_restarts )); then echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery" dump_component_diagnostics "${component}" @@ -379,4 +380,4 @@ runs: check_component "${component}" done verify_stability_window - kubectl_kind get --raw='/readyz' + require_readyz "after stability window" diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml index d20d7cbdd..55325e870 100644 --- a/.github/actions/gpu-cluster-setup/action.yml +++ b/.github/actions/gpu-cluster-setup/action.yml @@ -44,10 +44,22 @@ inputs: description: 'Apply kubeadm patches that raise control-plane static pod resource requests' required: false default: 'false' - disable_control_plane_leader_election: - description: 'Disable kube-controller-manager and kube-scheduler leader election for single-control-plane kind CI clusters' + control_plane_leader_election_tuning: + description: 'Increase kube-controller-manager and kube-scheduler leader 
election timeouts for slow CI control planes' required: false default: 'false' + leader_election_lease_duration: + description: 'Leader election lease duration when control_plane_leader_election_tuning is true' + required: false + default: '120s' + leader_election_renew_deadline: + description: 'Leader election renew deadline when control_plane_leader_election_tuning is true' + required: false + default: '90s' + leader_election_retry_period: + description: 'Leader election retry period when control_plane_leader_election_tuning is true' + required: false + default: '10s' api_server_cpu_request: description: 'kube-apiserver CPU request when control_plane_resource_patches is true' required: false @@ -282,7 +294,10 @@ runs: KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }} CONTROL_PLANE_RESOURCE_PATCHES: ${{ inputs.control_plane_resource_patches }} - DISABLE_CONTROL_PLANE_LEADER_ELECTION: ${{ inputs.disable_control_plane_leader_election }} + CONTROL_PLANE_LEADER_ELECTION_TUNING: ${{ inputs.control_plane_leader_election_tuning }} + LEADER_ELECTION_LEASE_DURATION: ${{ inputs.leader_election_lease_duration }} + LEADER_ELECTION_RENEW_DEADLINE: ${{ inputs.leader_election_renew_deadline }} + LEADER_ELECTION_RETRY_PERIOD: ${{ inputs.leader_election_retry_period }} API_SERVER_CPU_REQUEST: ${{ inputs.api_server_cpu_request }} API_SERVER_MEMORY_REQUEST: ${{ inputs.api_server_memory_request }} CONTROLLER_MANAGER_CPU_REQUEST: ${{ inputs.controller_manager_cpu_request }} @@ -309,16 +324,16 @@ runs: ;; esac - case "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" in + case "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" in true) ;; - ""|false) DISABLE_CONTROL_PLANE_LEADER_ELECTION=false ;; + ""|false) CONTROL_PLANE_LEADER_ELECTION_TUNING=false ;; *) - echo "::error::disable_control_plane_leader_election must be true or false, got '${DISABLE_CONTROL_PLANE_LEADER_ELECTION}'" + echo "::error::control_plane_leader_election_tuning must be true 
or false, got '${CONTROL_PLANE_LEADER_ELECTION_TUNING}'" exit 1 ;; esac - if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then patch_dir="$(mktemp -d)" config_template="$(mktemp)" @@ -417,19 +432,27 @@ runs: directory: /patches EOF fi - if [[ "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then - cat >> "${config_template}" <<'EOF' + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + cat >> "${config_template}" <> "${config_template}" <<'EOF' @@ -462,8 +485,11 @@ runs: sed 's/^/ /' "${patch_file}" done fi - if [[ "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then - echo "Disabling kube-controller-manager and kube-scheduler leader election for single-control-plane kind CI." + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + echo "Increasing kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes:" + echo " lease-duration=${LEADER_ELECTION_LEASE_DURATION}" + echo " renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}" + echo " retry-period=${LEADER_ELECTION_RETRY_PERIOD}" fi CREATE_ARGS+=(--config-template="${config_template}") fi diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index ef08a55e0..38a82bd88 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -126,7 +126,7 @@ jobs: min_available_memory_gb: '16' cluster_create_timeout: 900s control_plane_resource_patches: 'true' - disable_control_plane_leader_election: 'true' + control_plane_leader_election_tuning: 'true' - name: Build aicr uses: ./.github/actions/aicr-build diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index f3317b72b..f5b95fb8c 100644 --- 
a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -122,7 +122,7 @@ jobs: min_available_memory_gb: '16' cluster_create_timeout: 900s control_plane_resource_patches: 'true' - disable_control_plane_leader_election: 'true' + control_plane_leader_election_tuning: 'true' - name: Build aicr uses: ./.github/actions/aicr-build diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index 862c098b4..d5f340c49 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -1318,7 +1318,7 @@ After `helm install`, the same manifests are re-applied as post-install to ensur **Async components:** -Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. +Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts. 
##### DRA kubelet plugin registration diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 7b52b74cf..e5d06fa1e 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -530,6 +530,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { wantRetryCap string wantApplyArgs string wantComment string + wantSnippets []string rejectRetryCap bool }{ { @@ -542,10 +543,15 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { Type: recipe.ComponentTypeHelm, Source: "oci://nvcr.io/nvidia/ai-dynamo", }, - wantTimeout: `COMPONENT_HELM_TIMEOUT="30m"`, - wantRetryAssignment: `COMPONENT_MAX_RETRIES="1"`, - wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]`, + wantTimeout: `COMPONENT_HELM_TIMEOUT="20m"`, + wantRetryAssignment: `COMPONENT_MAX_RETRIES="3"`, + wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]`, wantApplyArgs: `COMPONENT_HELM_APPLY_ARGS=(--server-side=false)`, + wantSnippets: []string{ + `dump_dynamo_platform_helm_diagnostics "${namespace}"`, + `deployment/dynamo-platform-dynamo-operator-controller-manager`, + `--previous --tail=200`, + }, }, { name: "kube-prometheus-stack", @@ -620,6 +626,11 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { if tt.wantComment != "" && !strings.Contains(componentBlock, tt.wantComment) { t.Errorf("deploy.sh missing %s retry rationale", tt.component.Name) } + for _, snippet := range tt.wantSnippets { + if !strings.Contains(script, snippet) { + t.Errorf("deploy.sh missing %s snippet %q", tt.component.Name, snippet) + } + } if tt.rejectRetryCap && retryCapPattern.MatchString(componentBlock) { t.Errorf("deploy.sh should not cap %s retries in its component block", tt.component.Name) } diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 711135bf4..cd4e786cd 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ 
b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -144,6 +144,34 @@ function dump_kai_scheduler_helm_diagnostics() { echo " --- End ${namespace} diagnostics ---" } +function dump_dynamo_platform_helm_diagnostics() { + local namespace="$1" + if [[ "${namespace}" != "dynamo-system" ]]; then + return + fi + + echo " --- ${namespace} diagnostics ---" + echo " Deployments:" + kubectl get deployments -n "${namespace}" -o wide 2>/dev/null || true + echo " Jobs:" + kubectl get jobs -n "${namespace}" 2>/dev/null || true + echo " Pods:" + kubectl get pods -n "${namespace}" -o wide 2>/dev/null || true + echo " Pod descriptions:" + kubectl describe pods -n "${namespace}" 2>/dev/null || true + echo " Dynamo operator manager logs:" + kubectl logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --tail=200 2>/dev/null || true + echo " Dynamo operator manager previous logs:" + kubectl logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --previous --tail=200 2>/dev/null || true + echo " Grove operator logs:" + kubectl logs deployment/grove-operator -n "${namespace}" --all-containers --tail=100 2>/dev/null || true + echo " Grove operator previous logs:" + kubectl logs deployment/grove-operator -n "${namespace}" --all-containers --previous --tail=100 2>/dev/null || true + echo " Recent events:" + kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true + echo " --- End ${namespace} diagnostics ---" +} + # helm_retry contract: # helm_retry "" "" "" [args...] 
# Callers must pass the retry budget as the third positional argument before the @@ -161,6 +189,7 @@ function helm_retry() { fi attempt=$((attempt + 1)) dump_kai_scheduler_helm_diagnostics "${namespace}" + dump_dynamo_platform_helm_diagnostics "${namespace}" if [[ ${attempt} -gt ${max_retries} ]]; then echo "ERROR: ${desc} failed after ${attempt} attempts" return 1 @@ -382,10 +411,10 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then COMPONENT_MAX_RETRIES="1" fi {{ else if eq .Name "dynamo-platform" -}} -COMPONENT_HELM_TIMEOUT="30m" +COMPONENT_HELM_TIMEOUT="20m" COMPONENT_HELM_APPLY_ARGS=(--server-side=false) -if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then - COMPONENT_MAX_RETRIES="1" +if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then + COMPONENT_MAX_RETRIES="3" fi {{ else if eq .Name "kube-prometheus-stack" -}} # Grafana can trip its Deployment progress deadline before a longer Helm From 0f308d68e8aa1e40ca73a403c251a7ac1a476108 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sun, 26 Apr 2026 14:36:45 -0700 Subject: [PATCH 15/21] Stabilize H100 GPU CI checks --- .../check-control-plane-health/action.yml | 104 ++++++++++++++++- .github/actions/gpu-cluster-setup/action.yml | 101 +++++++++++++++- .github/actions/gpu-test-cleanup/action.yml | 69 ++++++++++- .../workflows/gpu-h100-inference-test.yaml | 11 +- .github/workflows/gpu-h100-training-test.yaml | 13 +-- .github/workflows/gpu-smoke-test.yaml | 6 + docs/user/cli-reference.md | 4 +- pkg/bundler/deployer/helm/helm_test.go | 110 ++++++++++++++++-- .../deployer/helm/templates/README.md.tmpl | 20 +++- .../helm/templates/component-README.md.tmpl | 32 ++++- .../deployer/helm/templates/deploy.sh.tmpl | 7 +- tests/chainsaw/ai-conformance/README.md | 3 +- .../kind-common/assert-monitoring.yaml | 85 ++++++++++++++ .../kind-inference-dynamo/chainsaw-test.yaml | 4 +- .../kind-training-kubeflow/chainsaw-test.yaml | 4 +- 15 files changed, 520 insertions(+), 53 deletions(-) create mode 100644 
tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml index fdb9b4a2e..582d78746 100644 --- a/.github/actions/check-control-plane-health/action.yml +++ b/.github/actions/check-control-plane-health/action.yml @@ -85,6 +85,9 @@ runs: echo "::error::stability_window must be a duration like 0s, 60s, or 2m; got '${STABILITY_WINDOW}'" exit 1 fi + if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then + STABILITY_WINDOW="0s" + fi RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}" RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}" @@ -167,11 +170,109 @@ runs: if ! kubectl_kind get --raw='/readyz'; then echo "::error::kube-apiserver /readyz failed ${reason}" - dump_control_plane_summary + dump_all_control_plane_runtime_diagnostics exit 1 fi } + dump_api_server_health() { + local endpoint + + for endpoint in '/livez?verbose' '/readyz?verbose' '/healthz'; do + echo "=== kube-apiserver ${endpoint} ===" + kubectl_kind get --raw="${endpoint}" || true + done + } + + dump_kind_node_runtime_summary() { + local node="${KIND_CLUSTER_NAME}-control-plane" + + if ! 
docker_timeout inspect "${node}" >/dev/null 2>&1; then + echo "::warning::cannot collect node runtime summary: kind node container ${node} not found" + return + fi + + echo "=== ${node} docker stats ===" + docker_timeout stats --no-stream \ + --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \ + "${node}" || true + + echo "=== ${node} docker inspect state ===" + docker_timeout inspect \ + --format 'status={{.State.Status}} running={{.State.Running}} oomKilled={{.State.OOMKilled}} pid={{.State.Pid}} started={{.State.StartedAt}} finished={{.State.FinishedAt}}' \ + "${node}" || true + + echo "=== ${node} node pressure snapshot ===" + docker_timeout exec "${node}" sh -c ' + date + uptime || true + free -h || true + df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h + echo "--- top cpu/memory processes ---" + ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true + ' || true + + echo "=== ${node} CRI pod/container summary ===" + docker_timeout exec "${node}" crictl pods || true + docker_timeout exec "${node}" crictl ps -a || true + docker_timeout exec "${node}" crictl stats || true + } + + dump_static_pod_runtime_diagnostics() { + local component="$1" + local node="${KIND_CLUSTER_NAME}-control-plane" + local container_ids + local container_id + local count=0 + + if ! 
docker_timeout inspect "${node}" >/dev/null 2>&1; then + echo "::warning::cannot collect ${component} runtime diagnostics: kind node container ${node} not found" + return + fi + + echo "=== ${node} ${component} static pod manifest ===" + docker_timeout exec "${node}" sh -c "sed -n '1,220p' /etc/kubernetes/manifests/${component}.yaml" || true + + echo "=== ${node} ${component} CRI containers ===" + docker_timeout exec "${node}" crictl ps -a --name "${component}" || true + + container_ids=$(docker_timeout exec "${node}" crictl ps -a --name "${component}" -q 2>/dev/null || true) + for container_id in ${container_ids}; do + count=$((count + 1)) + if (( count > 8 )); then + echo "Skipping remaining ${component} CRI containers after first 8 entries." + break + fi + + echo "=== ${node} crictl inspect ${component} ${container_id} ===" + docker_timeout exec "${node}" crictl inspect "${container_id}" || true + echo "=== ${node} crictl logs ${component} ${container_id} ===" + docker_timeout exec "${node}" crictl logs --tail=200 "${container_id}" || true + done + + echo "=== ${node} kubelet journal (${component}) ===" + docker_timeout exec "${node}" journalctl -u kubelet --since '45 minutes ago' --no-pager 2>/dev/null \ + | grep -Ei "${component}|static pod|mirror pod|probe|liveness|readiness|startup|back-off|backoff|container|failed|error|oom|killed" \ + | tail -200 || true + + echo "=== ${node} containerd journal (${component}) ===" + docker_timeout exec "${node}" journalctl -u containerd --since '45 minutes ago' --no-pager 2>/dev/null \ + | grep -Ei "${component}|container|task|shim|deadline|failed|error|oom|killed" \ + | tail -200 || true + } + + dump_all_control_plane_runtime_diagnostics() { + local component + + dump_control_plane_summary + dump_api_server_health + dump_kind_node_runtime_summary + for component in ${COMPONENTS}; do + dump_static_pod_runtime_diagnostics "${component}" + kubectl_kind -n kube-system get lease "${component}" -o yaml 2>/dev/null || true + 
done + } + dump_component_diagnostics() { local component="$1" local selector="component=${component}" @@ -192,6 +293,7 @@ runs: kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true done <<< "${pods}" + dump_all_control_plane_runtime_diagnostics kubectl_kind -n kube-system get lease "${component}" -o yaml 2>/dev/null || true } diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml index 55325e870..93ab80cef 100644 --- a/.github/actions/gpu-cluster-setup/action.yml +++ b/.github/actions/gpu-cluster-setup/action.yml @@ -51,11 +51,11 @@ inputs: leader_election_lease_duration: description: 'Leader election lease duration when control_plane_leader_election_tuning is true' required: false - default: '120s' + default: '300s' leader_election_renew_deadline: description: 'Leader election renew deadline when control_plane_leader_election_tuning is true' required: false - default: '90s' + default: '240s' leader_election_retry_period: description: 'Leader election retry period when control_plane_leader_election_tuning is true' required: false @@ -427,13 +427,27 @@ runs: cat >> "${config_template}" <<'EOF' - | kind: InitConfiguration - apiVersion: kubeadm.k8s.io/v1beta4 patches: directory: /patches EOF fi if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + # kind v0.31 renders kubeadm v1beta3. Keep a v1beta4 patch too so + # this remains valid when a future kind image switches API versions. 
cat >> "${config_template}" < /tmp/debug-artifacts/all-resources.txt || true kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true + kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true + kubectl_kind get --raw='/readyz?verbose' > /tmp/debug-artifacts/apiserver-readyz.txt 2>&1 || true + kubectl_kind -n kube-system get pods -l tier=control-plane -o wide \ + > /tmp/debug-artifacts/control-plane-pods.txt 2>&1 || true + kubectl_kind -n kube-system get events --sort-by='.lastTimestamp' \ + > /tmp/debug-artifacts/kube-system-events.txt 2>&1 || true + for component in ${CONTROL_PLANE_COMPONENTS}; do + kubectl_kind -n kube-system describe pod -l "component=${component}" \ + > "/tmp/debug-artifacts/${component}-describe.txt" 2>&1 || true + kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --tail=300 \ + > "/tmp/debug-artifacts/${component}-logs.txt" 2>&1 || true + kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --previous --tail=300 \ + > "/tmp/debug-artifacts/${component}-previous-logs.txt" 2>&1 || true + kubectl_kind -n kube-system get lease "${component}" -o yaml \ + > "/tmp/debug-artifacts/${component}-lease.yaml" 2>&1 || true + done kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true @@ -55,8 +72,54 @@ runs: echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive" fi + docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + --format '{{.Names}}' | sort | while read -r node_container; do + [[ -z "${node_container}" ]] && continue + 
node_file="${node_container//[^A-Za-z0-9_.-]/_}" + timeout 30s docker exec "${node_container}" journalctl -u kubelet \ + --since "90 minutes ago" --no-pager \ + > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true + timeout 30s docker exec "${node_container}" journalctl -u containerd \ + --since "90 minutes ago" --no-pager \ + > "/tmp/debug-artifacts/${node_file}-containerd-journal.txt" 2>&1 || true + timeout 30s docker exec "${node_container}" crictl ps -a \ + > "/tmp/debug-artifacts/${node_file}-crictl-ps-a.txt" 2>&1 || true + timeout 30s docker exec "${node_container}" crictl pods \ + > "/tmp/debug-artifacts/${node_file}-crictl-pods.txt" 2>&1 || true + timeout 30s docker exec "${node_container}" crictl stats \ + > "/tmp/debug-artifacts/${node_file}-crictl-stats.txt" 2>&1 || true + timeout 30s docker exec "${node_container}" sh -c ' + date + uptime || true + free -h || true + df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h + echo "--- top cpu/memory processes ---" + ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true + ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true + timeout 120s docker exec "${node_container}" sh -c ' + for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do + echo "=== ${component} static pod manifest ===" + sed -n "1,220p" "/etc/kubernetes/manifests/${component}.yaml" 2>/dev/null || true + echo "=== ${component} CRI containers ===" + crictl ps -a --name "${component}" || true + count=0 + for container_id in $(crictl ps -a --name "${component}" -q 2>/dev/null); do + count=$((count + 1)) + if [ "${count}" -gt 8 ]; then + echo "Skipping remaining ${component} CRI containers after first 8 entries." 
+ break + fi + echo "=== crictl inspect ${component} ${container_id} ===" + crictl inspect "${container_id}" || true + echo "=== crictl logs ${component} ${container_id} ===" + crictl logs --tail=300 "${container_id}" || true + done + done + ' > "/tmp/debug-artifacts/${node_file}-control-plane-cri.txt" 2>&1 || true + done + - name: Export kind logs - if: failure() + if: failure() || cancelled() shell: bash env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} @@ -82,7 +145,7 @@ runs: timeout 60s docker system prune -f --filter "until=24h" || true - name: Upload debug artifacts - if: failure() + if: failure() || cancelled() uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: ${{ inputs.artifact_name_prefix }}-${{ github.run_id }} diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index 38a82bd88..081bb0261 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -128,10 +128,10 @@ jobs: control_plane_resource_patches: 'true' control_plane_leader_election_tuning: 'true' - - name: Build aicr + - name: Build aicr and snapshot agent image uses: ./.github/actions/aicr-build with: - build_snapshot_agent: 'false' + build_snapshot_agent: 'true' validator_phases: 'none' # Fast readiness gate after cluster setup. 
Stability windows start after @@ -164,13 +164,6 @@ jobs: # --- Snapshot and GPU validation --- - - name: Build snapshot agent image - uses: ./.github/actions/aicr-build - with: - build_cli: 'false' - build_snapshot_agent: 'true' - validator_phases: 'none' - - name: Snapshot and validate GPU uses: ./.github/actions/gpu-snapshot-validate with: diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index f5b95fb8c..a617bc718 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -124,10 +124,10 @@ jobs: control_plane_resource_patches: 'true' control_plane_leader_election_tuning: 'true' - - name: Build aicr + - name: Build aicr and snapshot agent image uses: ./.github/actions/aicr-build with: - build_snapshot_agent: 'false' + build_snapshot_agent: 'true' validator_phases: 'none' # Fast readiness gate after cluster setup. Stability windows start after @@ -160,15 +160,6 @@ jobs: stability_window: 60s recover_unhealthy: 'true' - # --- Snapshot and GPU validation --- - - - name: Build snapshot agent image - uses: ./.github/actions/aicr-build - with: - build_cli: 'false' - build_snapshot_agent: 'true' - validator_phases: 'none' - - name: Snapshot and validate GPU uses: ./.github/actions/gpu-snapshot-validate with: diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml index d5b8c5c74..25d968b67 100644 --- a/.github/workflows/gpu-smoke-test.yaml +++ b/.github/workflows/gpu-smoke-test.yaml @@ -88,6 +88,12 @@ jobs: - name: Set up GPU cluster uses: ./.github/actions/gpu-cluster-setup + with: + # Keep smoke runner preflight explicit so action default changes do not + # silently alter L40G coverage. 
+ min_gpu_count: '1' + min_free_disk_gb: '20' + min_available_memory_gb: '8' - name: Build aicr uses: ./.github/actions/aicr-build diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index d5f340c49..a6dd85b1d 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -1308,6 +1308,8 @@ Unknown flags are rejected with an error to catch typos (e.g., `--best-effort`). The deploy script retries failed `helm upgrade --install` and `kubectl apply` operations with exponential backoff. By default, each operation is retried up to 5 times (6 total attempts). The backoff delay increases quadratically: 5s, 20s, 45s, 80s, 120s (capped) between retries. +On slower H100 CI runners, `kube-prometheus-stack` can hit Grafana's Deployment progress deadline before a longer Helm timeout would help. The deploy script intentionally keeps the default timeout and retry budget for `kube-prometheus-stack` so subsequent upgrade attempts can succeed after image pulls and controllers settle. Kind H100 Chainsaw health checks do not require Grafana because AICR conformance metrics use Prometheus, DCGM exporter, and prometheus-adapter directly. + Use `--retries 0` to disable retries (fail-fast behavior). When `--best-effort` is also set, retries are exhausted first before falling through to best-effort handling. **Pre-install manifests and CRD ordering:** @@ -1318,7 +1320,7 @@ After `helm install`, the same manifests are re-applied as post-install to ensur **Async components:** -Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. 
Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts. +Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. Kai Scheduler installs use a 30 minute per-attempt timeout and cap the retry budget at 1 retry so hook diagnostics surface quickly on cold runners. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts. ##### DRA kubelet plugin registration diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index e5d06fa1e..ff41c88e4 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -518,20 +518,47 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) { if !strings.Contains(script, `kubectl describe pods -n "${namespace}"`) { t.Error("deploy.sh missing pod diagnostics") } + + rootReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "README.md")) + if err != nil { + t.Fatalf("failed to read root README: %v", err) + } + componentReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "kai-scheduler", "README.md")) + if err != nil { + t.Fatalf("failed to read component README: %v", err) + } + rootReadme := string(rootReadmeContent) + componentReadme := string(componentReadmeContent) + if !strings.Contains(rootReadme, `--timeout 30m`) { + t.Error("root README missing kai-scheduler 30m timeout") + } + if !strings.Contains(componentReadme, `--timeout 30m`) { + 
t.Error("component README missing kai-scheduler 30m timeout") + } + if strings.Contains(componentReadme, `--wait --timeout 30m`) { + t.Error("component README should document kai-scheduler without --wait") + } + if strings.Contains(componentReadme, `--wait --timeout 10m`) { + t.Error("component README should not use default timeout for kai-scheduler") + } } func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { retryCapPattern := regexp.MustCompile(`(?m)(if \[\[ "\$\{COMPONENT_MAX_RETRIES\}" -gt \d+ \]\]|COMPONENT_MAX_RETRIES="\d+")`) + applyArgsExpansion := `${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"}` tests := []struct { - name string - component recipe.ComponentRef - wantTimeout string - wantRetryAssignment string - wantRetryCap string - wantApplyArgs string - wantComment string - wantSnippets []string - rejectRetryCap bool + name string + component recipe.ComponentRef + wantTimeout string + wantRetryAssignment string + wantRetryCap string + wantApplyArgs string + wantComment string + wantSnippets []string + wantReadmeSnippets []string + rejectSnippets []string + rejectReadmeSnippets []string + rejectRetryCap bool }{ { name: "dynamo-platform", @@ -552,6 +579,10 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { `deployment/dynamo-platform-dynamo-operator-controller-manager`, `--previous --tail=200`, }, + wantReadmeSnippets: []string{ + `--server-side=false`, + `--wait --timeout 20m`, + }, }, { name: "kube-prometheus-stack", @@ -567,6 +598,33 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { wantComment: `preserve the default retry`, rejectRetryCap: true, }, + { + name: "ordinary component defaults", + component: recipe.ComponentRef{ + Name: "gpu-operator", + Namespace: "gpu-operator", + Chart: "gpu-operator", + Version: "v25.10.1", + Type: recipe.ComponentTypeHelm, + Source: "https://helm.ngc.nvidia.com/nvidia", + }, + wantTimeout: `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`, + wantApplyArgs: 
`COMPONENT_HELM_APPLY_ARGS=()`, + wantReadmeSnippets: []string{ + `--wait --timeout 10m`, + }, + rejectSnippets: []string{ + `--server-side=false`, + `COMPONENT_MAX_RETRIES="1"`, + `COMPONENT_MAX_RETRIES="3"`, + }, + rejectReadmeSnippets: []string{ + `--server-side=false`, + `--wait --timeout 20m`, + `--timeout 30m`, + }, + rejectRetryCap: true, + }, } for _, tt := range tests { @@ -620,7 +678,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { if tt.wantApplyArgs != "" && !strings.Contains(componentBlock, tt.wantApplyArgs) { t.Errorf("deploy.sh missing %s apply args %q", tt.component.Name, tt.wantApplyArgs) } - if tt.wantApplyArgs != "" && !strings.Contains(script[blockStart:], `"${COMPONENT_HELM_APPLY_ARGS[@]}"`) { + if tt.wantApplyArgs != "" && !strings.Contains(script[blockStart:], applyArgsExpansion) { t.Errorf("deploy.sh missing %s apply args in helm command", tt.component.Name) } if tt.wantComment != "" && !strings.Contains(componentBlock, tt.wantComment) { @@ -631,9 +689,41 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { t.Errorf("deploy.sh missing %s snippet %q", tt.component.Name, snippet) } } + for _, snippet := range tt.rejectSnippets { + if strings.Contains(componentBlock, snippet) { + t.Errorf("deploy.sh should not include %s snippet %q", tt.component.Name, snippet) + } + } if tt.rejectRetryCap && retryCapPattern.MatchString(componentBlock) { t.Errorf("deploy.sh should not cap %s retries in its component block", tt.component.Name) } + + rootReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "README.md")) + if err != nil { + t.Fatalf("failed to read root README: %v", err) + } + componentReadmeContent, err := os.ReadFile(filepath.Join(outputDir, tt.component.Name, "README.md")) + if err != nil { + t.Fatalf("failed to read component README: %v", err) + } + rootReadme := string(rootReadmeContent) + componentReadme := string(componentReadmeContent) + for _, snippet := range tt.wantReadmeSnippets { + if 
!strings.Contains(rootReadme, snippet) { + t.Errorf("root README missing %s snippet %q", tt.component.Name, snippet) + } + if !strings.Contains(componentReadme, snippet) { + t.Errorf("component README missing %s snippet %q", tt.component.Name, snippet) + } + } + for _, snippet := range tt.rejectReadmeSnippets { + if strings.Contains(rootReadme, snippet) { + t.Errorf("root README should not include %s snippet %q", tt.component.Name, snippet) + } + if strings.Contains(componentReadme, snippet) { + t.Errorf("component README should not include %s snippet %q", tt.component.Name, snippet) + } + } }) } } diff --git a/pkg/bundler/deployer/helm/templates/README.md.tmpl b/pkg/bundler/deployer/helm/templates/README.md.tmpl index 3c3e874f4..ba3cef380 100644 --- a/pkg/bundler/deployer/helm/templates/README.md.tmpl +++ b/pkg/bundler/deployer/helm/templates/README.md.tmpl @@ -77,19 +77,31 @@ kustomize build '{{ .Repository }}//{{ .Path }}{{ if .Tag }}?ref={{ .Tag }}{{ en ```bash {{ if .IsOCI -}} helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --version {{ .Version }} \ -n {{ .Namespace }} --create-namespace \ -f {{ .Name }}/values.yaml \ -f {{ .Name }}/cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ else -}} helm upgrade --install {{ .Name }} {{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --repo {{ .Repository }} \ --version {{ .Version }} \ -n {{ .Namespace }} --create-namespace \ -f {{ .Name }}/values.yaml \ -f {{ .Name }}/cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ end -}} ``` {{ end -}} @@ -119,7 +131,9 @@ Each Helm 
component has two values files in its directory: ## Upgrade -To upgrade a specific Helm component: +To upgrade a specific Helm component, use the generic form below. Some +components require component-specific flags; use the component subdirectory +`README.md` for the exact command. ```bash helm upgrade --version -n -f /values.yaml -f /cluster-values.yaml --wait --timeout 10m diff --git a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl index 068bfcd28..7797779a0 100644 --- a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl +++ b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl @@ -43,19 +43,31 @@ Namespace: {{ .Namespace }} ```bash {{ if .IsOCI -}} helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --version {{ .Version }} \ -n {{ .Namespace }} --create-namespace \ -f values.yaml \ -f cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ else -}} helm upgrade --install {{ .Name }} {{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --repo {{ .Repository }} \ --version {{ .Version }} \ -n {{ .Namespace }} --create-namespace \ -f values.yaml \ -f cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ end -}} ``` {{ if .HasManifests }} @@ -70,19 +82,31 @@ kubectl apply -f manifests/ ```bash {{ if .IsOCI -}} helm upgrade {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --version {{ .Version }} \ -n {{ .Namespace }} \ -f values.yaml \ -f cluster-values.yaml \ - --wait 
--timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ else -}} helm upgrade {{ .Name }} {{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --repo {{ .Repository }} \ --version {{ .Version }} \ -n {{ .Namespace }} \ -f values.yaml \ -f cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ end -}} ``` diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index cd4e786cd..7bd03a356 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -418,7 +418,8 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then fi {{ else if eq .Name "kube-prometheus-stack" -}} # Grafana can trip its Deployment progress deadline before a longer Helm -# timeout helps. Keep the default 10m timeout and preserve the default retry +# timeout helps, especially on slower H100 CI runners under image-pull and +# control-plane load. Keep the default 10m timeout and preserve the default retry # budget so later upgrades can succeed after images and controllers settle. {{ end -}} # Derive wait args: global --wait/--no-wait behavior + component timeout. 
@@ -436,7 +437,7 @@ fi helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ "${COMPONENT_MAX_RETRIES}" \ helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ - "${COMPONENT_HELM_APPLY_ARGS[@]}" \ + ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \ {{ if .Version }}--version {{ .Version }} \ {{ end -}} -n {{ .Namespace }} --create-namespace \ @@ -448,7 +449,7 @@ helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ "${COMPONENT_MAX_RETRIES}" \ helm upgrade --install {{ .Name }} {{ .ChartName }} \ - "${COMPONENT_HELM_APPLY_ARGS[@]}" \ + ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \ --repo {{ .Repository }} \ {{ if .Version }}--version {{ .Version }} \ {{ end -}} diff --git a/tests/chainsaw/ai-conformance/README.md b/tests/chainsaw/ai-conformance/README.md index b1a88e9d4..a69b88f13 100644 --- a/tests/chainsaw/ai-conformance/README.md +++ b/tests/chainsaw/ai-conformance/README.md @@ -73,10 +73,11 @@ tests/chainsaw/ai-conformance/ │ ├── assert-cert-manager.yaml # cert-manager healthy │ ├── assert-dra-driver.yaml # DRA driver healthy │ ├── assert-kai-scheduler.yaml # KAI scheduler healthy -│ ├── assert-monitoring.yaml # Prometheus stack healthy +│ ├── assert-monitoring.yaml # Prometheus stack healthy with Grafana │ └── assert-skyhook.yaml # Skyhook operator healthy ├── kind-common/ # Shared Kind-only assertions │ ├── assert-gpu-operator.yaml # GPU operator healthy on kind +│ ├── assert-monitoring.yaml # Prometheus stack healthy without Grafana │ ├── assert-network-operator.yaml # Network operator healthy on kind │ └── assert-nvsentinel.yaml # NVSentinel healthy on kind ├── kind-inference-dynamo/ # Kind + H100 + inference + dynamo leaf suite diff --git a/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml new file mode 100644 index 000000000..868be3fea --- /dev/null 
+++ b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml @@ -0,0 +1,85 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assert kind monitoring stack components required by H100 CI are healthy. +# Grafana is intentionally not asserted here because conformance metrics use +# Prometheus, DCGM exporter, and prometheus-adapter directly. + +# Prometheus Operator - manages Prometheus, Alertmanager, and ServiceMonitor CRs +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-prometheus-operator + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" +--- +# kube-state-metrics - Kubernetes object state metrics +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" +--- +# Prometheus StatefulSet - time series database +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus-kube-prometheus-prometheus + namespace: monitoring +status: + (readyReplicas > `0`): true +--- +# Alertmanager StatefulSet - alert routing and silencing +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: alertmanager-kube-prometheus-alertmanager + namespace: monitoring +status: + (readyReplicas > `0`): true +--- +# Prometheus Node Exporter DaemonSet - node-level hardware/OS metrics +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: 
prometheus-node-exporter + namespace: monitoring +status: + (numberReady > `0`): true + (desiredNumberScheduled > `0`): true +--- +# k8s-ephemeral-storage-metrics - ephemeral storage usage metrics +apiVersion: apps/v1 +kind: Deployment +metadata: + name: k8s-ephemeral-storage-metrics + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" +--- +# Prometheus Adapter - custom metrics API for HPA +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-adapter + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml index 85aa33ab6..cac236b32 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml @@ -65,10 +65,10 @@ spec: # ── Monitoring ───────────────────────────────────────────────────── - name: assert-monitoring - description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. + description: Verify kind monitoring stack without Grafana. try: - assert: - file: ../common/assert-monitoring.yaml + file: ../kind-common/assert-monitoring.yaml # ── kgateway ─────────────────────────────────────────────────────── - name: assert-kgateway diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml index e3d2b35a9..20332ad64 100644 --- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml @@ -60,10 +60,10 @@ spec: file: ../kind-common/assert-gpu-operator.yaml - name: assert-monitoring - description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. 
+ description: Verify kind monitoring stack without Grafana. try: - assert: - file: ../common/assert-monitoring.yaml + file: ../kind-common/assert-monitoring.yaml - name: assert-skyhook description: Verify Skyhook operator controller-manager is available. From 22304c97f35cbe2287cdd91d780ad0b956007459 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sun, 26 Apr 2026 14:43:55 -0700 Subject: [PATCH 16/21] Address H100 CI review feedback --- .../check-control-plane-health/action.yml | 54 +++++++++++++----- .github/actions/gpu-cluster-setup/action.yml | 55 +++++++++++++++++++ .../actions/install-karpenter-kwok/action.yml | 23 ++++++++ .../workflows/gpu-h100-inference-test.yaml | 8 +++ .github/workflows/gpu-h100-training-test.yaml | 8 +++ docs/user/cli-reference.md | 6 +- .../deployer/helm/templates/deploy.sh.tmpl | 3 + 7 files changed, 142 insertions(+), 15 deletions(-) diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml index 582d78746..59b52e3f0 100644 --- a/.github/actions/check-control-plane-health/action.yml +++ b/.github/actions/check-control-plane-health/action.yml @@ -69,6 +69,16 @@ runs: run: | set -euo pipefail + validate_duration_input() { + local input_name="$1" + local input_value="$2" + + if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::${input_name} must be a duration like 60s, 2m, or 1h; got '${input_value}'" + exit 1 + fi + } + MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}" MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}" if ! 
[[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then @@ -76,15 +86,16 @@ runs: exit 1 fi + WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}" + WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}" + validate_duration_input wait_timeout "${WAIT_TIMEOUT}" + STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}" STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}" if [[ -z "${STABILITY_WINDOW}" ]]; then STABILITY_WINDOW="0s" fi - if ! [[ "${STABILITY_WINDOW}" =~ ^[0-9]+[smh]$ ]]; then - echo "::error::stability_window must be a duration like 0s, 60s, or 2m; got '${STABILITY_WINDOW}'" - exit 1 - fi + validate_duration_input stability_window "${STABILITY_WINDOW}" if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then STABILITY_WINDOW="0s" fi @@ -107,7 +118,11 @@ runs: timeout 30s docker "$@" } + STATIC_POD_RECREATE_SETTLE_SECONDS=5 + RESTART_COUNT_ATTEMPTS=3 + RESTART_COUNT_RETRY_SLEEP_SECONDS=2 declare -A RECOVERY_ATTEMPTS=() + declare -A INITIAL_RESTARTS=() kubectl_kind get --raw='/readyz' || true @@ -127,15 +142,26 @@ runs: local restart_counts local restart_count local total=0 + local attempt + + for ((attempt = 1; attempt <= RESTART_COUNT_ATTEMPTS; attempt++)); do + if restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \ + -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then + if [[ -n "${restart_counts}" ]]; then + break + fi + echo "::warning::no container statuses found for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2 + else + echo "::warning::failed to read restart counts for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2 + fi + + if (( attempt < RESTART_COUNT_ATTEMPTS )); then + sleep "${RESTART_COUNT_RETRY_SLEEP_SECONDS}" + fi + done - if ! 
restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \ - -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then - echo "::error::failed to read restart counts for ${component} pods" >&2 - dump_component_diagnostics "${component}" >&2 - exit 1 - fi if [[ -z "${restart_counts}" ]]; then - echo "::error::no container statuses found for ${component} pods" >&2 + echo "::error::no container statuses found for ${component} pods after ${RESTART_COUNT_ATTEMPTS} attempts" >&2 dump_component_diagnostics "${component}" >&2 exit 1 fi @@ -358,7 +384,9 @@ runs: fi done - sleep 5 + # Give kubelet a short interval to observe the stopped CRI container + # and refresh the mirror pod before kubectl wait reads pod status. + sleep "${STATIC_POD_RECREATE_SETTLE_SECONDS}" if ! wait_ready "${component}"; then echo "::warning::${component} did not recover after static pod container restart" dump_component_diagnostics "${component}" @@ -476,8 +504,6 @@ runs: done } - declare -A INITIAL_RESTARTS=() - for component in ${COMPONENTS}; do check_component "${component}" done diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml index 93ab80cef..21d27800c 100644 --- a/.github/actions/gpu-cluster-setup/action.yml +++ b/.github/actions/gpu-cluster-setup/action.yml @@ -309,6 +309,60 @@ runs: run: | set -euo pipefail + validate_duration_input() { + local input_name="$1" + local input_value="$2" + + if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'" + exit 1 + fi + } + + validate_generated_control_plane_config() { + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + for patch_file in "${patch_dir}"/*.yaml; do + if ! grep -Fxq 'apiVersion: v1' "${patch_file}" || + ! grep -Fxq 'kind: Pod' "${patch_file}" || + ! 
grep -Eq '^[[:space:]]+resources:$' "${patch_file}"; then + echo "::error::rendered static pod patch ${patch_file} is missing expected top-level YAML" + sed 's/^/ /' "${patch_file}" || true + exit 1 + fi + done + + if ! grep -Eq '^[[:space:]]*extraMounts:$' "${config_template}" || + ! grep -Fq 'directory: /patches' "${config_template}"; then + echo "::error::rendered kind config is missing control-plane patch mounts" + sed 's/^/ /' "${config_template}" || true + exit 1 + fi + fi + + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + for expected in \ + 'apiVersion: kubeadm.k8s.io/v1beta3' \ + 'apiVersion: kubeadm.k8s.io/v1beta4' \ + "leader-elect-lease-duration: \"${LEADER_ELECTION_LEASE_DURATION}\"" \ + "leader-elect-renew-deadline: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \ + "leader-elect-retry-period: \"${LEADER_ELECTION_RETRY_PERIOD}\"" \ + "value: \"${LEADER_ELECTION_LEASE_DURATION}\"" \ + "value: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \ + "value: \"${LEADER_ELECTION_RETRY_PERIOD}\""; do + if ! 
grep -Fq "${expected}" "${config_template}"; then + echo "::error::rendered kind config is missing expected leader election setting: ${expected}" + sed 's/^/ /' "${config_template}" || true + exit 1 + fi + done + fi + } + + validate_duration_input cluster_create_timeout "${CLUSTER_CREATE_TIMEOUT}" + validate_duration_input leader_election_lease_duration "${LEADER_ELECTION_LEASE_DURATION}" + validate_duration_input leader_election_renew_deadline "${LEADER_ELECTION_RENEW_DEADLINE}" + validate_duration_input leader_election_retry_period "${LEADER_ELECTION_RETRY_PERIOD}" + CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}") if [[ -n "${KIND_NODE_IMAGE}" ]]; then echo "Using kind node image: ${KIND_NODE_IMAGE}" @@ -505,6 +559,7 @@ runs: echo " renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}" echo " retry-period=${LEADER_ELECTION_RETRY_PERIOD}" fi + validate_generated_control_plane_config CREATE_ARGS+=(--config-template="${config_template}") fi diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml index c917b2abc..f66848e6f 100644 --- a/.github/actions/install-karpenter-kwok/action.yml +++ b/.github/actions/install-karpenter-kwok/action.yml @@ -70,5 +70,28 @@ runs: KARPENTER_HELM_TIMEOUT: ${{ inputs.karpenter_helm_timeout }} run: | set -euo pipefail + validate_duration_input() { + local input_name="$1" + local input_value="$2" + + if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'" + exit 1 + fi + } + + validate_seconds_input() { + local input_name="$1" + local input_value="$2" + + if ! 
[[ "${input_value}" =~ ^[0-9]+$ ]]; then + echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'" + exit 1 + fi + } + + validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT}" + validate_seconds_input ko_build_timeout "${KO_BUILD_TIMEOUT}" + validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT}" bash kwok/scripts/install-karpenter-kwok.sh kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index 081bb0261..4ef7c43e7 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -154,6 +154,8 @@ jobs: wait: 'true' best_effort: 'false' + # Runtime install creates many CRDs, webhooks, and controllers. Keep a + # stability window here to catch KCM/scheduler restarts before snapshot. - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: @@ -174,6 +176,8 @@ jobs: # --- Install Karpenter + KWOK early to give monitoring stack settle time --- + # Snapshot deploys a GPU Job and exercises cluster discovery; verify the + # control plane stayed stable before adding Karpenter/KWOK. - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: @@ -198,6 +202,8 @@ jobs: install_chainsaw: 'true' chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' + # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above + # only installs a runner-side binary. - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: @@ -227,6 +233,8 @@ jobs: build_snapshot_agent: 'false' validator_phases: 'conformance' + # Validator image build/load can contend with Docker and kind containerd; + # verify the control plane before the final conformance workload. 
- name: Check control plane health uses: ./.github/actions/check-control-plane-health with: diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index a617bc718..ae96a49a7 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -151,6 +151,8 @@ jobs: wait: 'true' best_effort: 'false' + # Runtime install creates many CRDs, webhooks, and controllers. Keep a + # stability window here to catch KCM/scheduler restarts before snapshot. - name: Check control plane health id: post_runtime_control_plane_health uses: ./.github/actions/check-control-plane-health @@ -170,6 +172,8 @@ jobs: # --- Install Karpenter + KWOK early to give monitoring stack settle time --- + # Snapshot deploys a GPU Job and exercises cluster discovery; verify the + # control plane stayed stable before adding Karpenter/KWOK. - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: @@ -194,6 +198,8 @@ jobs: install_chainsaw: 'true' chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' + # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above + # only installs a runner-side binary. - name: Check control plane health uses: ./.github/actions/check-control-plane-health with: @@ -223,6 +229,8 @@ jobs: build_snapshot_agent: 'false' validator_phases: 'conformance' + # Validator image build/load can contend with Docker and kind containerd; + # verify the control plane before the final conformance workload. 
- name: Check control plane health uses: ./.github/actions/check-control-plane-health with: diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index a6dd85b1d..ba62aef1a 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -1320,7 +1320,11 @@ After `helm install`, the same manifests are re-applied as post-install to ensur **Async components:** -Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. Kai Scheduler installs use a 30 minute per-attempt timeout and cap the retry budget at 1 retry so hook diagnostics surface quickly on cold runners. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts. +Components that use operator patterns with custom resources that reconcile asynchronously use component-specific install behavior: + +- `kai-scheduler` installs without `--wait` to avoid Helm timing out on CR readiness. It uses a 30 minute per-attempt timeout and caps the retry budget at 1 retry so hook diagnostics surface quickly on cold runners. +- `dynamo-platform` uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts. 
+- `kube-prometheus-stack` keeps the default timeout and retry budget because Grafana can hit its Deployment progress deadline before a longer Helm timeout would help, especially on slower H100 CI runners under image-pull and control-plane load. ##### DRA kubelet plugin registration diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 7bd03a356..826ab08ed 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -412,6 +412,9 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then fi {{ else if eq .Name "dynamo-platform" -}} COMPONENT_HELM_TIMEOUT="20m" +# Grove owns the generated webhook certificate Secret data after install. +# Client-side apply avoids server-side field ownership conflicts during retries. +# Requires Helm v4+ for --server-side=false; AICR bundles pin Helm v4 in .settings.yaml. COMPONENT_HELM_APPLY_ARGS=(--server-side=false) if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then COMPONENT_MAX_RETRIES="3" From 553051c3621cab63cf631c7a3851edb14b123b0e Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sun, 26 Apr 2026 14:53:42 -0700 Subject: [PATCH 17/21] Avoid pull request events for GPU runners --- .github/workflows/gpu-h100-inference-test.yaml | 6 +++--- .github/workflows/gpu-h100-training-test.yaml | 6 +++--- .github/workflows/gpu-smoke-test.yaml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index 4ef7c43e7..27d9d27cc 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -20,8 +20,6 @@ on: push: branches: - "pull-request/[0-9]+" - pull_request: - types: [labeled] workflow_dispatch: {} # Allow manual runs permissions: @@ -86,11 +84,13 @@ jobs: gpu-inference-test: needs: [check-paths] + # NVIDIA self-hosted GPU runners reject pull_request event 
jobs before + # checkout. PR GPU coverage runs through the pull-request/ push + # mirror after ok-to-test approval. if: > always() && ( github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') || (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) name: GPU Inference Test (nvkind + H100 x2) diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index ae96a49a7..972e38f73 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -20,8 +20,6 @@ on: push: branches: - "pull-request/[0-9]+" - pull_request: - types: [labeled] workflow_dispatch: {} # Allow manual runs permissions: @@ -82,11 +80,13 @@ jobs: gpu-training-test: needs: [check-paths] + # NVIDIA self-hosted GPU runners reject pull_request event jobs before + # checkout. PR GPU coverage runs through the pull-request/ push + # mirror after ok-to-test approval. if: > always() && ( github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') || (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) name: GPU Training Test (nvkind + H100 x2) diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml index 25d968b67..805548afc 100644 --- a/.github/workflows/gpu-smoke-test.yaml +++ b/.github/workflows/gpu-smoke-test.yaml @@ -20,8 +20,6 @@ on: push: branches: - "pull-request/[0-9]+" - pull_request: - types: [labeled] workflow_dispatch: {} # Allow manual runs permissions: @@ -62,11 +60,13 @@ jobs: gpu-smoke-test: needs: [check-paths] + # NVIDIA self-hosted GPU runners reject pull_request event jobs before + # checkout. PR GPU coverage runs through the pull-request/ push + # mirror after ok-to-test approval. 
if: > always() && ( github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') || (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) name: GPU Smoke Test (nvkind + L40G) From ff99d6af322f6cb67407c9b91a896438d673eaeb Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sun, 26 Apr 2026 15:14:31 -0700 Subject: [PATCH 18/21] Address GPU CI review feedback --- .github/actions/aicr-build/action.yml | 79 +-- .github/actions/aicr-build/build-cli.sh | 10 + .../aicr-build/build-snapshot-agent.sh | 18 + .../aicr-build/build-validator-images.sh | 35 ++ .github/actions/aicr-build/stage-cli.sh | 4 + .../check-control-plane-health/action.yml | 444 +-------------- .../check-control-plane-health.sh | 457 +++++++++++++++ .github/actions/gpu-cluster-setup/action.yml | 521 +----------------- .../check-runner-capacity.sh | 29 + .../configure-nvidia-container-toolkit.sh | 21 + .../create-gpu-kind-cluster.sh | 411 ++++++++++++++ .../delete-stale-kind-cluster.sh | 39 ++ .../increase-inotify-limits.sh | 19 + .../gpu-cluster-setup/install-nvkind.sh | 19 + .../gpu-cluster-setup/runner-preflight.sh | 69 +++ .../validate-docker-gpu-access.sh | 17 + .../actions/gpu-cluster-setup/validate-env.sh | 21 + .../gpu-cluster-setup/warm-kind-node-image.sh | 29 + .../actions/gpu-operator-install/action.yml | 98 +--- .../gpu-operator-install/generate-bundle.sh | 23 + .../gpu-operator-install/generate-recipe.sh | 29 + .../gpu-operator-install/install-bundle.sh | 35 ++ .../install-gpu-operator-helm.sh | 29 + .../wait-gpu-operands-bundle.sh | 38 ++ .../wait-gpu-operands-helm.sh | 22 + .../actions/gpu-snapshot-validate/action.yml | 56 +- .../debug-snapshot-job.sh | 35 ++ .../gpu-snapshot-validate/run-snapshot.sh | 26 + .../validate-snapshot-gpu.sh | 31 ++ .github/actions/gpu-test-cleanup/action.yml | 105 +--- .../gpu-test-cleanup/cleanup-kind-cluster.sh | 32 ++ 
.../collect-debug-artifacts.sh | 104 ++++ .../gpu-test-cleanup/export-kind-logs.sh | 19 + .../actions/install-karpenter-kwok/action.yml | 34 +- .../install-karpenter-kwok.sh | 41 ++ .../resolve-versions.sh | 20 + .github/scripts/gpu-chainsaw-health.sh | 10 + .github/scripts/gpu-debug-diagnostics.sh | 146 +++++ .github/scripts/gpu-smoke-run-nvidia-smi.sh | 24 + .github/scripts/gpu-validate-conformance.sh | 15 + .../workflows/gpu-h100-inference-test.yaml | 125 +---- .github/workflows/gpu-h100-training-test.yaml | 101 +--- .github/workflows/gpu-smoke-test.yaml | 39 +- docs/user/cli-reference.md | 4 +- pkg/bundler/deployer/helm/helm_test.go | 4 + .../deployer/helm/templates/README.md.tmpl | 3 + .../helm/templates/component-README.md.tmpl | 3 + .../deployer/helm/templates/deploy.sh.tmpl | 31 +- 48 files changed, 1975 insertions(+), 1549 deletions(-) create mode 100644 .github/actions/aicr-build/build-cli.sh create mode 100644 .github/actions/aicr-build/build-snapshot-agent.sh create mode 100644 .github/actions/aicr-build/build-validator-images.sh create mode 100644 .github/actions/aicr-build/stage-cli.sh create mode 100644 .github/actions/check-control-plane-health/check-control-plane-health.sh create mode 100644 .github/actions/gpu-cluster-setup/check-runner-capacity.sh create mode 100644 .github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh create mode 100644 .github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh create mode 100644 .github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh create mode 100644 .github/actions/gpu-cluster-setup/increase-inotify-limits.sh create mode 100644 .github/actions/gpu-cluster-setup/install-nvkind.sh create mode 100644 .github/actions/gpu-cluster-setup/runner-preflight.sh create mode 100644 .github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh create mode 100644 .github/actions/gpu-cluster-setup/validate-env.sh create mode 100644 .github/actions/gpu-cluster-setup/warm-kind-node-image.sh create 
mode 100644 .github/actions/gpu-operator-install/generate-bundle.sh create mode 100644 .github/actions/gpu-operator-install/generate-recipe.sh create mode 100644 .github/actions/gpu-operator-install/install-bundle.sh create mode 100644 .github/actions/gpu-operator-install/install-gpu-operator-helm.sh create mode 100644 .github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh create mode 100644 .github/actions/gpu-operator-install/wait-gpu-operands-helm.sh create mode 100644 .github/actions/gpu-snapshot-validate/debug-snapshot-job.sh create mode 100644 .github/actions/gpu-snapshot-validate/run-snapshot.sh create mode 100644 .github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh create mode 100644 .github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh create mode 100644 .github/actions/gpu-test-cleanup/collect-debug-artifacts.sh create mode 100644 .github/actions/gpu-test-cleanup/export-kind-logs.sh create mode 100644 .github/actions/install-karpenter-kwok/install-karpenter-kwok.sh create mode 100644 .github/actions/install-karpenter-kwok/resolve-versions.sh create mode 100644 .github/scripts/gpu-chainsaw-health.sh create mode 100644 .github/scripts/gpu-debug-diagnostics.sh create mode 100644 .github/scripts/gpu-smoke-run-nvidia-smi.sh create mode 100644 .github/scripts/gpu-validate-conformance.sh diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml index 14d6a595b..671392215 100644 --- a/.github/actions/aicr-build/action.yml +++ b/.github/actions/aicr-build/action.yml @@ -41,91 +41,22 @@ runs: shell: bash env: GOFLAGS: -mod=vendor - run: | - set -euo pipefail - mkdir -p dist - if [[ -x dist/aicr ]]; then - echo "Reusing existing dist/aicr" - exit 0 - fi - CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr + run: bash "${{ github.action_path }}/build-cli.sh" - name: Build snapshot agent image and load into kind if: inputs.build_snapshot_agent == 'true' shell: bash - run: | - set -euo pipefail - # Build 
snapshot agent image with CUDA base (provides nvidia-smi for GPU detection). - # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed. - # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot. - docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE' - FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04 - COPY dist/aicr /usr/local/bin/aicr - ENTRYPOINT ["/usr/local/bin/aicr"] - DOCKERFILE - - # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but - # does not set a node selector, so it can land on any GPU-capable node - # including the control-plane (e.g., T4 smoke test). - # - # Timeout is intentionally generous (900s per attempt). H100 self-hosted - # runners transfer images over a shared Docker-in-Docker bridge; large - # CUDA base images (~250MB compressed) combined with I/O contention from - # parallel GPU operator pods regularly exceed the previous 600s limit. - timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || { - echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..." - timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" - } + run: bash "${{ github.action_path }}/build-snapshot-agent.sh" - name: Build validator images and load into kind if: "!(inputs.validator_phases == 'none' || (inputs.validator_phases == '' && inputs.build_validators == 'false'))" shell: bash env: GOFLAGS: -mod=vendor - run: | - set -euo pipefail - # Determine which validator phases to build. - # validator_phases takes precedence; build_validators is a deprecated fallback. 
- if [[ -n "${{ inputs.validator_phases }}" ]]; then - if [[ "${{ inputs.validator_phases }}" == "none" ]]; then - echo "Skipping validator builds (validator_phases=none)" - exit 0 - fi - PHASES="${{ inputs.validator_phases }}" - else - # Default: build all phases (backwards compatible) - PHASES="deployment,performance,conformance" - fi - - # Compile only the requested validator binaries. - mkdir -p dist/validator - for phase in ${PHASES//,/ }; do - echo "Building validator binary: ${phase}" - CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}" - done - - for phase in ${PHASES//,/ }; do - mkdir -p "validators/${phase}/testdata" - docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <&2 - else - echo "::warning::failed to read restart counts for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2 - fi - - if (( attempt < RESTART_COUNT_ATTEMPTS )); then - sleep "${RESTART_COUNT_RETRY_SLEEP_SECONDS}" - fi - done - - if [[ -z "${restart_counts}" ]]; then - echo "::error::no container statuses found for ${component} pods after ${RESTART_COUNT_ATTEMPTS} attempts" >&2 - dump_component_diagnostics "${component}" >&2 - exit 1 - fi - - while IFS= read -r restart_count; do - [[ -z "${restart_count}" ]] && continue - total=$((total + restart_count)) - done <<< "${restart_counts}" - echo "${total}" - } - - report_restart_baseline() { - local component="$1" - local restart_count="$2" - - if (( restart_count > 0 )); then - echo "::warning::${component} has historical restartCount=${restart_count}; checking current readiness and stability window only" - return - fi - echo "${component} restartCount=${restart_count}" - } - - dump_control_plane_summary() { - echo "=== Control-plane pod restart summary ===" - kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane -o wide || true - kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane \ - -o jsonpath='{range .items[*]}{.metadata.name}{" 
restartCount="}{range .status.containerStatuses[*]}{.restartCount}{" "}{end}{"\n"}{end}' || true - } - - require_readyz() { - local reason="$1" - - if ! kubectl_kind get --raw='/readyz'; then - echo "::error::kube-apiserver /readyz failed ${reason}" - dump_all_control_plane_runtime_diagnostics - exit 1 - fi - } - - dump_api_server_health() { - local endpoint - - for endpoint in '/livez?verbose' '/readyz?verbose' '/healthz'; do - echo "=== kube-apiserver ${endpoint} ===" - kubectl_kind get --raw="${endpoint}" || true - done - } - - dump_kind_node_runtime_summary() { - local node="${KIND_CLUSTER_NAME}-control-plane" - - if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then - echo "::warning::cannot collect node runtime summary: kind node container ${node} not found" - return - fi - - echo "=== ${node} docker stats ===" - docker_timeout stats --no-stream \ - --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \ - "${node}" || true - - echo "=== ${node} docker inspect state ===" - docker_timeout inspect \ - --format 'status={{.State.Status}} running={{.State.Running}} oomKilled={{.State.OOMKilled}} pid={{.State.Pid}} started={{.State.StartedAt}} finished={{.State.FinishedAt}}' \ - "${node}" || true - - echo "=== ${node} node pressure snapshot ===" - docker_timeout exec "${node}" sh -c ' - date - uptime || true - free -h || true - df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h - echo "--- top cpu/memory processes ---" - ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true - ' || true - - echo "=== ${node} CRI pod/container summary ===" - docker_timeout exec "${node}" crictl pods || true - docker_timeout exec "${node}" crictl ps -a || true - docker_timeout exec "${node}" crictl stats || true - } - - dump_static_pod_runtime_diagnostics() { - local component="$1" - local node="${KIND_CLUSTER_NAME}-control-plane" - local container_ids - local container_id - local count=0 - - if ! 
docker_timeout inspect "${node}" >/dev/null 2>&1; then - echo "::warning::cannot collect ${component} runtime diagnostics: kind node container ${node} not found" - return - fi - - echo "=== ${node} ${component} static pod manifest ===" - docker_timeout exec "${node}" sh -c "sed -n '1,220p' /etc/kubernetes/manifests/${component}.yaml" || true - - echo "=== ${node} ${component} CRI containers ===" - docker_timeout exec "${node}" crictl ps -a --name "${component}" || true - - container_ids=$(docker_timeout exec "${node}" crictl ps -a --name "${component}" -q 2>/dev/null || true) - for container_id in ${container_ids}; do - count=$((count + 1)) - if (( count > 8 )); then - echo "Skipping remaining ${component} CRI containers after first 8 entries." - break - fi - - echo "=== ${node} crictl inspect ${component} ${container_id} ===" - docker_timeout exec "${node}" crictl inspect "${container_id}" || true - echo "=== ${node} crictl logs ${component} ${container_id} ===" - docker_timeout exec "${node}" crictl logs --tail=200 "${container_id}" || true - done - - echo "=== ${node} kubelet journal (${component}) ===" - docker_timeout exec "${node}" journalctl -u kubelet --since '45 minutes ago' --no-pager 2>/dev/null \ - | grep -Ei "${component}|static pod|mirror pod|probe|liveness|readiness|startup|back-off|backoff|container|failed|error|oom|killed" \ - | tail -200 || true - - echo "=== ${node} containerd journal (${component}) ===" - docker_timeout exec "${node}" journalctl -u containerd --since '45 minutes ago' --no-pager 2>/dev/null \ - | grep -Ei "${component}|container|task|shim|deadline|failed|error|oom|killed" \ - | tail -200 || true - } - - dump_all_control_plane_runtime_diagnostics() { - local component - - dump_control_plane_summary - dump_api_server_health - dump_kind_node_runtime_summary - for component in ${COMPONENTS}; do - dump_static_pod_runtime_diagnostics "${component}" - kubectl_kind -n kube-system get lease "${component}" -o yaml 2>/dev/null || true - 
done - } - - dump_component_diagnostics() { - local component="$1" - local selector="component=${component}" - local pods - local pod - - dump_control_plane_summary - kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true - kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true - kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - - pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name 2>/dev/null || true) - while IFS= read -r pod; do - [[ -z "${pod}" ]] && continue - echo "=== ${pod} logs ===" - kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --tail=100 2>/dev/null || true - echo "=== ${pod} previous logs ===" - kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true - done <<< "${pods}" - - dump_all_control_plane_runtime_diagnostics - kubectl_kind -n kube-system get lease "${component}" -o yaml 2>/dev/null || true - } - - is_recovery_component() { - local component="$1" - local candidate - - for candidate in ${RECOVERY_COMPONENTS}; do - if [[ "${candidate}" == "${component}" ]]; then - return 0 - fi - done - return 1 - } - - try_recover_component() { - local component="$1" - local reason="$2" - local node="${KIND_CLUSTER_NAME}-control-plane" - local attempt - local container_ids - local container_id - - if [[ "${RECOVER_UNHEALTHY}" != "true" ]]; then - return 1 - fi - if (( MAX_RECOVERY_ATTEMPTS == 0 )); then - return 1 - fi - if ! is_recovery_component "${component}"; then - return 1 - fi - - attempt="${RECOVERY_ATTEMPTS[${component}]:-0}" - if (( attempt >= MAX_RECOVERY_ATTEMPTS )); then - return 1 - fi - RECOVERY_ATTEMPTS["${component}"]=$((attempt + 1)) - - echo "::warning::${component} is unhealthy (${reason}); restarting static pod container (attempt $((attempt + 1))/${MAX_RECOVERY_ATTEMPTS})" - dump_component_diagnostics "${component}" - - if ! 
docker_timeout inspect "${node}" >/dev/null 2>&1; then - echo "::warning::cannot recover ${component}: kind node container ${node} not found" - return 1 - fi - - if ! container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null); then - echo "::warning::cannot recover ${component}: timed out or failed to list containers in ${node}" - return 1 - fi - if [[ -z "${container_ids}" ]]; then - echo "::warning::cannot recover ${component}: no running container found in ${node}" - return 1 - fi - - for container_id in ${container_ids}; do - echo "Stopping ${component} container ${container_id} in ${node}..." - if ! docker_timeout exec "${node}" crictl stop "${container_id}"; then - echo "::warning::failed to stop ${component} container ${container_id}" - return 1 - fi - done - - # Give kubelet a short interval to observe the stopped CRI container - # and refresh the mirror pod before kubectl wait reads pod status. - sleep "${STATIC_POD_RECREATE_SETTLE_SECONDS}" - if ! wait_ready "${component}"; then - echo "::warning::${component} did not recover after static pod container restart" - dump_component_diagnostics "${component}" - kubectl_kind get --raw='/readyz' || true - return 1 - fi - - echo "${component} recovered after static pod container restart." - return 0 - } - - check_component() { - local component="$1" - local selector="component=${component}" - local pods - local initial_restarts - - if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then - if ! try_recover_component "${component}" "failed to list pods in ${NAMESPACE} with selector ${selector}"; then - echo "::error::failed to list ${component} pods in ${NAMESPACE} with selector ${selector}" - kubectl_kind -n "${NAMESPACE}" get pods -o wide || true - exit 1 - fi - if ! 
pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then - echo "::error::failed to list ${component} pods after recovery" - kubectl_kind -n "${NAMESPACE}" get pods -o wide || true - exit 1 - fi - fi - if [[ -z "${pods}" ]]; then - echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}" - kubectl_kind -n "${NAMESPACE}" get pods -o wide || true - exit 1 - fi - - if ! wait_ready "${component}"; then - if ! try_recover_component "${component}" "pods did not become Ready within ${WAIT_TIMEOUT}"; then - echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}" - dump_component_diagnostics "${component}" - kubectl_kind get --raw='/readyz' || true - exit 1 - fi - fi - initial_restarts=$(restart_total "${component}") - report_restart_baseline "${component}" "${initial_restarts}" - INITIAL_RESTARTS["${component}"]="${initial_restarts}" - } - - verify_stability_window() { - local component - local initial_restarts - local final_restarts - local recovered=false - - if [[ "${STABILITY_WINDOW}" == "0s" ]]; then - return - fi - - echo "Observing control-plane stability for ${STABILITY_WINDOW}..." - sleep "${STABILITY_WINDOW}" - for component in ${COMPONENTS}; do - initial_restarts="${INITIAL_RESTARTS[${component}]:-}" - if [[ -z "${initial_restarts}" ]]; then - echo "::error::missing initial restart count for ${component}" - exit 1 - fi - if ! wait_ready "${component}"; then - if ! 
try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then - echo "::error::${component} pods became unready during ${STABILITY_WINDOW}" - dump_component_diagnostics "${component}" - kubectl_kind get --raw='/readyz' || true - exit 1 - fi - initial_restarts=$(restart_total "${component}") - report_restart_baseline "${component}" "${initial_restarts}" - INITIAL_RESTARTS["${component}"]="${initial_restarts}" - recovered=true - continue - fi - final_restarts=$(restart_total "${component}") - if (( final_restarts > initial_restarts )); then - echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}" - dump_component_diagnostics "${component}" - kubectl_kind get --raw='/readyz' || true - exit 1 - fi - INITIAL_RESTARTS["${component}"]="${final_restarts}" - done - - if [[ "${recovered}" != "true" ]]; then - return - fi - - echo "::warning::control-plane recovery occurred; observing one additional ${STABILITY_WINDOW} stability window" - sleep "${STABILITY_WINDOW}" - for component in ${COMPONENTS}; do - initial_restarts="${INITIAL_RESTARTS[${component}]:-}" - if [[ -z "${initial_restarts}" ]]; then - echo "::error::missing post-recovery restart count for ${component}" - exit 1 - fi - if ! 
wait_ready "${component}"; then - echo "::error::${component} pods became unready after recovery" - dump_component_diagnostics "${component}" - kubectl_kind get --raw='/readyz' || true - exit 1 - fi - final_restarts=$(restart_total "${component}") - if (( final_restarts > initial_restarts )); then - echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery" - dump_component_diagnostics "${component}" - exit 1 - fi - INITIAL_RESTARTS["${component}"]="${final_restarts}" - done - } - - for component in ${COMPONENTS}; do - check_component "${component}" - done - verify_stability_window - require_readyz "after stability window" + run: bash "${{ github.action_path }}/check-control-plane-health.sh" diff --git a/.github/actions/check-control-plane-health/check-control-plane-health.sh b/.github/actions/check-control-plane-health/check-control-plane-health.sh new file mode 100644 index 000000000..3614df47f --- /dev/null +++ b/.github/actions/check-control-plane-health/check-control-plane-health.sh @@ -0,0 +1,457 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +validate_duration_input() { + local input_name="$1" + local input_value="$2" + + if ! 
[[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::${input_name} must be a duration like 60s, 2m, or 1h; got '${input_value}'" + exit 1 + fi +} + +MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}" +MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}" +if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then + echo "::error::max_recovery_attempts must be a non-negative integer, got '${MAX_RECOVERY_ATTEMPTS}'" + exit 1 +fi + +WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}" +WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}" +validate_duration_input wait_timeout "${WAIT_TIMEOUT}" + +STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}" +STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}" +if [[ -z "${STABILITY_WINDOW}" ]]; then + STABILITY_WINDOW="0s" +fi +validate_duration_input stability_window "${STABILITY_WINDOW}" +if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then + STABILITY_WINDOW="0s" +fi + +RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}" +RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}" +case "${RECOVER_UNHEALTHY}" in + true|false) ;; + *) + echo "::error::recover_unhealthy must be true or false, got '${RECOVER_UNHEALTHY}'" + exit 1 + ;; +esac + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +docker_timeout() { + timeout 30s docker "$@" +} + +STATIC_POD_RECREATE_SETTLE_SECONDS=5 +RESTART_COUNT_ATTEMPTS=3 +RESTART_COUNT_RETRY_SLEEP_SECONDS=2 +declare -A RECOVERY_ATTEMPTS=() +declare -A INITIAL_RESTARTS=() + +kubectl_kind get --raw='/readyz' || true + +wait_ready() { + local component="$1" + local selector="component=${component}" + + if ! 
timeout "${WAIT_TIMEOUT}" kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+    wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then
+    return 1
+  fi
+}
+
+# restart_total COMPONENT
+# Prints the summed restartCount of every container in the pods labelled
+# component=COMPONENT. Retries the read up to RESTART_COUNT_ATTEMPTS times
+# and exits 1 (after dumping diagnostics to stderr) if no counts are found.
+# Warnings and errors go to stderr because callers capture stdout.
+restart_total() {
+  local component="$1"
+  local selector="component=${component}"
+  # Assigned explicitly so the emptiness check below is safe under `set -u`
+  # even when the retry loop runs zero times.
+  local restart_counts=""
+  local restart_count
+  local total=0
+  local attempt
+
+  for ((attempt = 1; attempt <= RESTART_COUNT_ATTEMPTS; attempt++)); do
+    if restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \
+      -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then
+      if [[ -n "${restart_counts}" ]]; then
+        break
+      fi
+      echo "::warning::no container statuses found for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
+    else
+      # Discard whatever a failed read printed so it is never summed as data.
+      restart_counts=""
+      echo "::warning::failed to read restart counts for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
+    fi
+
+    if (( attempt < RESTART_COUNT_ATTEMPTS )); then
+      sleep "${RESTART_COUNT_RETRY_SLEEP_SECONDS}"
+    fi
+  done
+
+  if [[ -z "${restart_counts}" ]]; then
+    echo "::error::no container statuses found for ${component} pods after ${RESTART_COUNT_ATTEMPTS} attempts" >&2
+    dump_component_diagnostics "${component}" >&2
+    exit 1
+  fi
+
+  while IFS= read -r restart_count; do
+    [[ -z "${restart_count}" ]] && continue
+    # Arithmetic expansion evaluates arbitrary strings as expressions, so
+    # accept plain decimal digits only.
+    if ! [[ "${restart_count}" =~ ^[0-9]+$ ]]; then
+      echo "::error::unexpected restartCount '${restart_count}' for ${component} pods" >&2
+      exit 1
+    fi
+    # 10# forces base 10 so a value with leading zeros is not read as octal.
+    total=$((total + 10#${restart_count}))
+  done <<< "${restart_counts}"
+  echo "${total}"
+}
+
+# report_restart_baseline COMPONENT RESTART_COUNT
+# Logs the baseline restart count; a non-zero baseline is reported as a
+# warning rather than a failure.
+report_restart_baseline() {
+  local component="$1"
+  local restart_count="$2"
+
+  if (( restart_count > 0 )); then
+    echo "::warning::${component} has historical restartCount=${restart_count}; checking current readiness and stability window only"
+    return
+  fi
+  echo "${component} restartCount=${restart_count}"
+}
+
+# Best-effort listing of the pods labelled tier=control-plane, followed by
+# one "<pod> restartCount=<n ...>" line per pod. Never fails the caller.
+dump_control_plane_summary() {
+  echo "=== Control-plane pod restart summary ==="
+  kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane -o wide || true
+  kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane \
+    -o jsonpath='{range .items[*]}{.metadata.name}{" restartCount="}{range .status.containerStatuses[*]}{.restartCount}{" "}{end}{"\n"}{end}' || true
+}
+
+# require_readyz REASON
+# Exits 1, after dumping all runtime diagnostics, when GET /readyz fails.
+# REASON is appended to the error message.
+require_readyz() {
+  local reason="$1"
+
+  if ! kubectl_kind get --raw='/readyz'; then
+    echo "::error::kube-apiserver /readyz failed ${reason}"
+    dump_all_control_plane_runtime_diagnostics
+    exit 1
+  fi
+}
+
+# Best-effort dump of the kube-apiserver verbose health endpoints.
+dump_api_server_health() {
+  local endpoint
+
+  for endpoint in '/livez?verbose' '/readyz?verbose' '/healthz'; do
+    echo "=== kube-apiserver ${endpoint} ==="
+    kubectl_kind get --raw="${endpoint}" || true
+  done
+}
+
+# Best-effort resource snapshot of the kind control-plane node container:
+# docker stats/inspect, memory/disk/process pressure, and CRI pod/container
+# listings. Only warns when the node container cannot be inspected.
+dump_kind_node_runtime_summary() {
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+
+  if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
+    echo "::warning::cannot collect node runtime summary: kind node container ${node} not found"
+    return
+  fi
+
+  echo "=== ${node} docker stats ==="
+  docker_timeout stats --no-stream \
+    --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \
+    "${node}" || true
+
+  echo "=== ${node} docker inspect state ==="
+  docker_timeout inspect \
+    --format 'status={{.State.Status}} running={{.State.Running}} oomKilled={{.State.OOMKilled}} pid={{.State.Pid}} started={{.State.StartedAt}} finished={{.State.FinishedAt}}' \
+    "${node}" || true
+
+  echo "=== ${node} node pressure snapshot ==="
+  docker_timeout exec "${node}" sh -c '
+    date
+    uptime || true
+    free -h || true
+    df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+    echo "--- top cpu/memory processes ---"
+    ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
+  ' || true
+
+  echo "=== ${node} CRI pod/container summary ==="
+  docker_timeout exec "${node}" crictl pods || true
+  docker_timeout exec "${node}" crictl ps -a || true
+  docker_timeout exec "${node}" crictl stats || true
+}
+
+dump_static_pod_runtime_diagnostics() {
+  local component="$1"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local container_ids
+  
local container_id
+  local count=0
+
+  if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
+    echo "::warning::cannot collect ${component} runtime diagnostics: kind node container ${node} not found"
+    return
+  fi
+
+  echo "=== ${node} ${component} static pod manifest ==="
+  # The component name is handed to the inner shell as $1 rather than being
+  # spliced into the command string, so it is never re-parsed as shell code.
+  docker_timeout exec "${node}" sh -c 'sed -n "1,220p" "/etc/kubernetes/manifests/$1.yaml"' sh "${component}" || true
+
+  echo "=== ${node} ${component} CRI containers ==="
+  docker_timeout exec "${node}" crictl ps -a --name "${component}" || true
+
+  container_ids=$(docker_timeout exec "${node}" crictl ps -a --name "${component}" -q 2>/dev/null || true)
+  for container_id in ${container_ids}; do
+    count=$((count + 1))
+    # Cap the per-container dumps so a crash loop cannot flood the log.
+    if (( count > 8 )); then
+      echo "Skipping remaining ${component} CRI containers after first 8 entries."
+      break
+    fi
+
+    echo "=== ${node} crictl inspect ${component} ${container_id} ==="
+    docker_timeout exec "${node}" crictl inspect "${container_id}" || true
+    echo "=== ${node} crictl logs ${component} ${container_id} ==="
+    docker_timeout exec "${node}" crictl logs --tail=200 "${container_id}" || true
+  done
+
+  echo "=== ${node} kubelet journal (${component}) ==="
+  docker_timeout exec "${node}" journalctl -u kubelet --since '45 minutes ago' --no-pager 2>/dev/null \
+    | grep -Ei "${component}|static pod|mirror pod|probe|liveness|readiness|startup|back-off|backoff|container|failed|error|oom|killed" \
+    | tail -200 || true
+
+  echo "=== ${node} containerd journal (${component}) ==="
+  docker_timeout exec "${node}" journalctl -u containerd --since '45 minutes ago' --no-pager 2>/dev/null \
+    | grep -Ei "${component}|container|task|shim|deadline|failed|error|oom|killed" \
+    | tail -200 || true
+}
+
+# Best-effort dump of cluster-wide state: pod summary, API server health,
+# node runtime summary, then per-component static pod diagnostics and lease
+# for every entry in COMPONENTS.
+dump_all_control_plane_runtime_diagnostics() {
+  local component
+
+  dump_control_plane_summary
+  dump_api_server_health
+  dump_kind_node_runtime_summary
+  for component in ${COMPONENTS}; do
+    dump_static_pod_runtime_diagnostics "${component}"
+    kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true
+  done
+}
+
+# dump_component_diagnostics COMPONENT
+# Best-effort dump for one component: pod listing, describe, recent events,
+# current and previous container logs, then the cluster-wide diagnostics and
+# the component lease. Every command is allowed to fail.
+dump_component_diagnostics() {
+  local component="$1"
+  local selector="component=${component}"
+  local pods
+  local pod
+
+  dump_control_plane_summary
+  kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true
+  kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true
+  kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+
+  pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name 2>/dev/null || true)
+  while IFS= read -r pod; do
+    [[ -z "${pod}" ]] && continue
+    echo "=== ${pod} logs ==="
+    kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --tail=100 2>/dev/null || true
+    echo "=== ${pod} previous logs ==="
+    kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true
+  done <<< "${pods}"
+
+  dump_all_control_plane_runtime_diagnostics
+  kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true
+}
+
+# is_recovery_component COMPONENT
+# Returns 0 when COMPONENT is one of the space-separated RECOVERY_COMPONENTS.
+is_recovery_component() {
+  local component="$1"
+  local candidate
+
+  for candidate in ${RECOVERY_COMPONENTS}; do
+    if [[ "${candidate}" == "${component}" ]]; then
+      return 0
+    fi
+  done
+  return 1
+}
+
+# try_recover_component COMPONENT REASON
+# Stops the component's running CRI containers inside the kind node and
+# waits for the component to report Ready again. Returns 0 only on a
+# successful recovery; returns 1 when recovery is disabled, the component is
+# not eligible, MAX_RECOVERY_ATTEMPTS is exhausted, or any step fails.
+# Attempts are counted per component in RECOVERY_ATTEMPTS.
+try_recover_component() {
+  local component="$1"
+  local reason="$2"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local attempt
+  local container_ids
+  local container_id
+
+  if [[ "${RECOVER_UNHEALTHY}" != "true" ]]; then
+    return 1
+  fi
+  if (( MAX_RECOVERY_ATTEMPTS == 0 )); then
+    return 1
+  fi
+  if ! is_recovery_component "${component}"; then
+    return 1
+  fi
+
+  attempt="${RECOVERY_ATTEMPTS[${component}]:-0}"
+  if (( attempt >= MAX_RECOVERY_ATTEMPTS )); then
+    return 1
+  fi
+  RECOVERY_ATTEMPTS["${component}"]=$((attempt + 1))
+
+  echo "::warning::${component} is unhealthy (${reason}); restarting static pod container (attempt $((attempt + 1))/${MAX_RECOVERY_ATTEMPTS})"
+  dump_component_diagnostics "${component}"
+
+  if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
+    echo "::warning::cannot recover ${component}: kind node container ${node} not found"
+    return 1
+  fi
+
+  if ! container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null); then
+    echo "::warning::cannot recover ${component}: timed out or failed to list containers in ${node}"
+    return 1
+  fi
+  if [[ -z "${container_ids}" ]]; then
+    echo "::warning::cannot recover ${component}: no running container found in ${node}"
+    return 1
+  fi
+
+  for container_id in ${container_ids}; do
+    echo "Stopping ${component} container ${container_id} in ${node}..."
+    if ! docker_timeout exec "${node}" crictl stop "${container_id}"; then
+      echo "::warning::failed to stop ${component} container ${container_id}"
+      return 1
+    fi
+  done
+
+  # Give kubelet a short interval to observe the stopped CRI container
+  # and refresh the mirror pod before kubectl wait reads pod status.
+  sleep "${STATIC_POD_RECREATE_SETTLE_SECONDS}"
+  if ! wait_ready "${component}"; then
+    echo "::warning::${component} did not recover after static pod container restart"
+    dump_component_diagnostics "${component}"
+    kubectl_kind get --raw='/readyz' || true
+    return 1
+  fi
+
+  echo "${component} recovered after static pod container restart."
+  return 0
+}
+
+check_component() {
+  local component="$1"
+  local selector="component=${component}"
+  local pods
+  local initial_restarts
+
+  if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then
+    if ! try_recover_component "${component}" "failed to list pods in ${NAMESPACE} with selector ${selector}"; then
+      echo "::error::failed to list ${component} pods in ${NAMESPACE} with selector ${selector}"
+      kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+      exit 1
+    fi
+    if ! 
pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then
+      echo "::error::failed to list ${component} pods after recovery"
+      kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+      exit 1
+    fi
+  fi
+  if [[ -z "${pods}" ]]; then
+    echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}"
+    kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+    exit 1
+  fi
+
+  if ! wait_ready "${component}"; then
+    if ! try_recover_component "${component}" "pods did not become Ready within ${WAIT_TIMEOUT}"; then
+      echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+  fi
+  # Record the restart count seen now; the stability window compares against it.
+  initial_restarts=$(restart_total "${component}")
+  report_restart_baseline "${component}" "${initial_restarts}"
+  INITIAL_RESTARTS["${component}"]="${initial_restarts}"
+}
+
+# verify_stability_window
+# Sleeps for STABILITY_WINDOW, then requires every component in COMPONENTS
+# to still be Ready with a restart count no higher than its recorded
+# baseline. An unready component may be recovered once via
+# try_recover_component; if any recovery happened, a second window is
+# observed in which no recovery is attempted. Exits 1 on any violation.
+# A STABILITY_WINDOW of exactly "0s" skips the check.
+verify_stability_window() {
+  local component
+  local initial_restarts
+  local final_restarts
+  local recovered=false
+
+  if [[ "${STABILITY_WINDOW}" == "0s" ]]; then
+    return
+  fi
+
+  echo "Observing control-plane stability for ${STABILITY_WINDOW}..."
+  sleep "${STABILITY_WINDOW}"
+  for component in ${COMPONENTS}; do
+    initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
+    if [[ -z "${initial_restarts}" ]]; then
+      echo "::error::missing initial restart count for ${component}"
+      exit 1
+    fi
+    if ! wait_ready "${component}"; then
+      if ! try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then
+        echo "::error::${component} pods became unready during ${STABILITY_WINDOW}"
+        dump_component_diagnostics "${component}"
+        kubectl_kind get --raw='/readyz' || true
+        exit 1
+      fi
+      # The recovery restarted the container, so re-baseline before the
+      # additional window instead of comparing against the old count.
+      initial_restarts=$(restart_total "${component}")
+      report_restart_baseline "${component}" "${initial_restarts}"
+      INITIAL_RESTARTS["${component}"]="${initial_restarts}"
+      recovered=true
+      continue
+    fi
+    final_restarts=$(restart_total "${component}")
+    if (( final_restarts > initial_restarts )); then
+      echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    INITIAL_RESTARTS["${component}"]="${final_restarts}"
+  done
+
+  if [[ "${recovered}" != "true" ]]; then
+    return
+  fi
+
+  echo "::warning::control-plane recovery occurred; observing one additional ${STABILITY_WINDOW} stability window"
+  sleep "${STABILITY_WINDOW}"
+  for component in ${COMPONENTS}; do
+    initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
+    if [[ -z "${initial_restarts}" ]]; then
+      echo "::error::missing post-recovery restart count for ${component}"
+      exit 1
+    fi
+    if ! wait_ready "${component}"; then
+      echo "::error::${component} pods became unready after recovery"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    final_restarts=$(restart_total "${component}")
+    if (( final_restarts > initial_restarts )); then
+      echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery"
+      dump_component_diagnostics "${component}"
+      # Probe /readyz here too, matching every other failure branch.
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    INITIAL_RESTARTS["${component}"]="${final_restarts}"
+  done
+}
+
+for component in ${COMPONENTS}; do
+  check_component "${component}"
+done
+verify_stability_window
+require_readyz "after stability window"
diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index 21d27800c..324ce7a8f 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -99,12 +99,7 @@ runs:
 
     - name: Validate environment
       shell: bash
-      run: |
-        if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then
-          echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow"
-          exit 1
-        fi
-
+      run: bash "${{ github.action_path }}/validate-env.sh"
     - name: Load versions
       id: versions
       uses: ./.github/actions/load-versions
@@ -130,10 +125,9 @@ runs:
 
     - name: Install nvkind
       shell: bash
-      run: |
-        go install "github.com/NVIDIA/nvkind/cmd/nvkind@${{ steps.versions.outputs.nvkind }}"
-        nvkind --help
-
+      env:
+        NVKIND_VERSION: ${{ steps.versions.outputs.nvkind }}
+      run: bash "${{ github.action_path }}/install-nvkind.sh"
     - name: Runner preflight
       shell: bash
       env:
@@ -141,153 +135,32 @@ runs:
         MIN_GPU_COUNT: ${{ inputs.min_gpu_count }}
         MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
         MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }}
-      run: |
-        set -euo pipefail
-
-        echo "=== Runner baseline ==="
-        date -u
-        hostname
-        uptime
-        nproc
-        free -h
-        df -h /
-        df -ih /
-
-        for value_name in MIN_GPU_COUNT MIN_FREE_DISK_GB
MIN_AVAILABLE_MEMORY_GB; do - value="${!value_name}" - if ! [[ "${value}" =~ ^[0-9]+$ ]]; then - echo "::error::${value_name} must be an integer, got '${value}'" - exit 1 - fi - done - - echo "=== Docker health ===" - docker info >/dev/null - docker version - - echo "=== Host GPUs ===" - nvidia-smi -L - nvidia-smi - - mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader) - if [[ -n "${GPU_MODEL_PATTERN}" ]]; then - set +e - gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic -- "${GPU_MODEL_PATTERN}") - grep_status=$? - set -e - if (( grep_status == 2 )); then - echo "::error::invalid gpu_model_pattern regex: ${GPU_MODEL_PATTERN}" - exit 1 - fi - if (( grep_status != 0 )); then - gpu_count=0 - fi - echo "Visible GPUs matching '${GPU_MODEL_PATTERN}': ${gpu_count}" - else - gpu_count="${#gpu_names[@]}" - echo "Visible GPUs: ${gpu_count}" - fi - - if (( gpu_count < MIN_GPU_COUNT )); then - echo "::error::visible GPU count ${gpu_count} is below required minimum ${MIN_GPU_COUNT}" - exit 1 - fi - - echo "=== Existing kind state ===" - kind get clusters || true - docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true - + run: bash "${{ github.action_path }}/runner-preflight.sh" - name: Configure NVIDIA Container Toolkit for kind shell: bash - run: | - sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled - sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place - sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place - sudo systemctl restart docker - + run: bash "${{ github.action_path }}/configure-nvidia-container-toolkit.sh" - name: Validate Docker GPU access shell: bash - run: | - set -euo pipefail - timeout 300s docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L - + run: bash "${{ github.action_path }}/validate-docker-gpu-access.sh" - name: Increase inotify limits 
shell: bash - run: | - sudo sysctl -w fs.inotify.max_user_watches=524288 - sudo sysctl -w fs.inotify.max_user_instances=1024 - + run: bash "${{ github.action_path }}/increase-inotify-limits.sh" - name: Delete stale kind cluster shell: bash - run: | - set -euo pipefail - kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" - if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then - echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}" - if ! timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then - echo "::warning::kind delete cluster timed out or failed; falling back to direct container cleanup" - fi - else - echo "No stale kind cluster named ${KIND_CLUSTER_NAME}" - fi - - remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}") - if [[ -n "${remaining_containers}" ]]; then - echo "Removing stale containers for ${KIND_CLUSTER_NAME}:" - docker ps -a --filter "label=${kind_cluster_label}" - docker rm -f ${remaining_containers} - fi - - remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}") - if [[ -n "${remaining_containers}" ]]; then - echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:" - docker ps -a --filter "label=${kind_cluster_label}" - exit 1 - fi - + run: bash "${{ github.action_path }}/delete-stale-kind-cluster.sh" - name: Check runner capacity shell: bash env: MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }} - run: | - set -euo pipefail - free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9') - if (( free_disk_gb < MIN_FREE_DISK_GB )); then - echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB" - exit 1 - fi - - available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}') - if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then - echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB" - exit 1 - fi - - 
echo "Runner capacity is sufficient: disk=${free_disk_gb}GiB memory=${available_memory_gb}GiB" - + run: bash "${{ github.action_path }}/check-runner-capacity.sh" - name: Warm kind node image if: ${{ inputs.kind_node_image != '' }} shell: bash env: KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} - run: | - set -euo pipefail - echo "=== Kind node image cache ===" - if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then - echo "Kind node image already cached: ${KIND_NODE_IMAGE}" - else - echo "Pulling kind node image: ${KIND_NODE_IMAGE}" - timeout 600s docker pull "${KIND_NODE_IMAGE}" - fi - free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9') - if (( free_disk_gb < MIN_FREE_DISK_GB )); then - echo "::error::free disk on / is ${free_disk_gb}GiB after warming ${KIND_NODE_IMAGE}, need at least ${MIN_FREE_DISK_GB}GiB" - exit 1 - fi - echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gb}GiB" - + run: bash "${{ github.action_path }}/warm-kind-node-image.sh" - name: Create GPU-enabled kind cluster shell: bash env: @@ -306,377 +179,7 @@ runs: SCHEDULER_MEMORY_REQUEST: ${{ inputs.scheduler_memory_request }} ETCD_CPU_REQUEST: ${{ inputs.etcd_cpu_request }} ETCD_MEMORY_REQUEST: ${{ inputs.etcd_memory_request }} - run: | - set -euo pipefail - - validate_duration_input() { - local input_name="$1" - local input_value="$2" - - if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then - echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'" - exit 1 - fi - } - - validate_generated_control_plane_config() { - if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then - for patch_file in "${patch_dir}"/*.yaml; do - if ! grep -Fxq 'apiVersion: v1' "${patch_file}" || - ! grep -Fxq 'kind: Pod' "${patch_file}" || - ! 
grep -Eq '^[[:space:]]+resources:$' "${patch_file}"; then - echo "::error::rendered static pod patch ${patch_file} is missing expected top-level YAML" - sed 's/^/ /' "${patch_file}" || true - exit 1 - fi - done - - if ! grep -Eq '^[[:space:]]*extraMounts:$' "${config_template}" || - ! grep -Fq 'directory: /patches' "${config_template}"; then - echo "::error::rendered kind config is missing control-plane patch mounts" - sed 's/^/ /' "${config_template}" || true - exit 1 - fi - fi - - if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then - for expected in \ - 'apiVersion: kubeadm.k8s.io/v1beta3' \ - 'apiVersion: kubeadm.k8s.io/v1beta4' \ - "leader-elect-lease-duration: \"${LEADER_ELECTION_LEASE_DURATION}\"" \ - "leader-elect-renew-deadline: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \ - "leader-elect-retry-period: \"${LEADER_ELECTION_RETRY_PERIOD}\"" \ - "value: \"${LEADER_ELECTION_LEASE_DURATION}\"" \ - "value: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \ - "value: \"${LEADER_ELECTION_RETRY_PERIOD}\""; do - if ! 
grep -Fq "${expected}" "${config_template}"; then - echo "::error::rendered kind config is missing expected leader election setting: ${expected}" - sed 's/^/ /' "${config_template}" || true - exit 1 - fi - done - fi - } - - validate_duration_input cluster_create_timeout "${CLUSTER_CREATE_TIMEOUT}" - validate_duration_input leader_election_lease_duration "${LEADER_ELECTION_LEASE_DURATION}" - validate_duration_input leader_election_renew_deadline "${LEADER_ELECTION_RENEW_DEADLINE}" - validate_duration_input leader_election_retry_period "${LEADER_ELECTION_RETRY_PERIOD}" - - CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}") - if [[ -n "${KIND_NODE_IMAGE}" ]]; then - echo "Using kind node image: ${KIND_NODE_IMAGE}" - CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}") - fi - - case "${CONTROL_PLANE_RESOURCE_PATCHES}" in - true) ;; - ""|false) CONTROL_PLANE_RESOURCE_PATCHES=false ;; - *) - echo "::error::control_plane_resource_patches must be true or false, got '${CONTROL_PLANE_RESOURCE_PATCHES}'" - exit 1 - ;; - esac - - case "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" in - true) ;; - ""|false) CONTROL_PLANE_LEADER_ELECTION_TUNING=false ;; - *) - echo "::error::control_plane_leader_election_tuning must be true or false, got '${CONTROL_PLANE_LEADER_ELECTION_TUNING}'" - exit 1 - ;; - esac - - if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then - patch_dir="$(mktemp -d)" - config_template="$(mktemp)" - - # Keep heredoc body indentation aligned with this run block. GitHub - # Actions strips the common run: | indent before bash sees it. 
- if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then - cat > "${patch_dir}/kube-apiserver+strategic.yaml" < "${patch_dir}/kube-controller-manager+strategic.yaml" < "${patch_dir}/kube-scheduler+strategic.yaml" < "${patch_dir}/etcd+strategic.yaml" < "${config_template}" <<'EOF' - kind: Cluster - apiVersion: kind.x-k8s.io/v1alpha4 - {{- if hasKey $ "name" }} - name: {{ $.name }} - {{- end }} - nodes: - - role: control-plane - {{- if hasKey $ "image" }} - image: {{ $.image }} - {{- end }} - EOF - if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then - cat >> "${config_template}" <> "${config_template}" <<'EOF' - kubeadmConfigPatches: - EOF - if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then - cat >> "${config_template}" <<'EOF' - - | - kind: InitConfiguration - patches: - directory: /patches - EOF - fi - if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then - # kind v0.31 renders kubeadm v1beta3. Keep a v1beta4 patch too so - # this remains valid when a future kind image switches API versions. - cat >> "${config_template}" <> "${config_template}" <<'EOF' - {{- range $.workers }} - - role: worker - {{- if hasKey $ "image" }} - image: {{ $.image }} - {{- end }} - - {{- if hasKey . "devices" }} - {{- $devices := .devices }} - {{- if not (kindIs "slice" $devices) }} - {{- $devices = list .devices }} - {{- end }} - extraMounts: - # We inject all NVIDIA GPUs using the nvidia-container-runtime. 
- # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set - # in `/etc/nvidia-container-runtime/config.toml` - {{- range $d := $devices }} - - hostPath: /dev/null - containerPath: /var/run/nvidia-container-devices/{{ $d }} - {{- end }} - {{- end }} - {{- end }} - EOF - if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then - echo "Applying control-plane static pod resource patches from ${patch_dir}:" - for patch_file in "${patch_dir}"/*.yaml; do - echo "--- ${patch_file}" - sed 's/^/ /' "${patch_file}" - done - fi - if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then - echo "Increasing kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes:" - echo " lease-duration=${LEADER_ELECTION_LEASE_DURATION}" - echo " renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}" - echo " retry-period=${LEADER_ELECTION_RETRY_PERIOD}" - fi - validate_generated_control_plane_config - CREATE_ARGS+=(--config-template="${config_template}") - fi - - set +e - timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}" - create_status=$? 
- set -e - if (( create_status != 0 )); then - echo "::warning::nvkind cluster create exited with status ${create_status}; continuing only if post-create checks pass" - fi - - kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s - kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info - kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide - kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes | \ - grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:| cpu| memory| nvidia.com/gpu)" || true - - echo "=== Kind node container resources ===" - docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ - --format '{{.Names}}' | sort | while read -r node_container; do - [[ -z "${node_container}" ]] && continue - docker inspect "${node_container}" \ - --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' - done - - echo "=== Control-plane resource requests/limits ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ - get pods -l tier=control-plane -o json | jq -r ' - .items[] as $pod | - $pod.metadata.name, - ($pod.spec.containers[] | - " " + .name + - " requests=" + ((.resources.requests // {}) | tostring) + - " limits=" + ((.resources.limits // {}) | tostring)) - ' || true - - normalize_cpu_request() { - local cpu="$1" - - if [[ "${cpu}" =~ ^([0-9]+)000m$ ]]; then - echo "${BASH_REMATCH[1]}" - return - fi - echo "${cpu}" - } - - control_plane_request() { - local component="$1" - local resource="$2" - - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ - get pod -l "component=${component}" \ - -o "jsonpath={.items[0].spec.containers[0].resources.requests.${resource}}" - } - - assert_control_plane_request() { - local component="$1" - local resource="$2" - local expected="$3" - local actual - - actual="$(control_plane_request 
"${component}" "${resource}")" - if [[ "${resource}" == "cpu" ]]; then - expected="$(normalize_cpu_request "${expected}")" - actual="$(normalize_cpu_request "${actual}")" - fi - if [[ "${actual}" != "${expected}" ]]; then - echo "::error::${component} ${resource} request is '${actual}', expected '${expected}'" - exit 1 - fi - echo "${component} ${resource} request verified: ${actual}" - } - - control_plane_command_args() { - local component="$1" - - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ - get pod -l "component=${component}" \ - -o jsonpath='{range .items[0].spec.containers[0].command[*]}{.}{"\n"}{end}{range .items[0].spec.containers[0].args[*]}{.}{"\n"}{end}' - } - - assert_control_plane_arg() { - local component="$1" - local expected="$2" - local command_args - - command_args="$(control_plane_command_args "${component}")" - if ! grep -Fxq "${expected}" <<< "${command_args}"; then - echo "::error::${component} live pod command/args does not contain ${expected}" - echo "Observed live command/args:" - echo "${command_args}" - exit 1 - fi - echo "${component} command/args verified: ${expected}" - } - - if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then - echo "Verifying control-plane resource patches..." 
-          assert_control_plane_request kube-apiserver cpu "${API_SERVER_CPU_REQUEST}"
-          assert_control_plane_request kube-apiserver memory "${API_SERVER_MEMORY_REQUEST}"
-          assert_control_plane_request kube-controller-manager cpu "${CONTROLLER_MANAGER_CPU_REQUEST}"
-          assert_control_plane_request kube-controller-manager memory "${CONTROLLER_MANAGER_MEMORY_REQUEST}"
-          assert_control_plane_request kube-scheduler cpu "${SCHEDULER_CPU_REQUEST}"
-          assert_control_plane_request kube-scheduler memory "${SCHEDULER_MEMORY_REQUEST}"
-          assert_control_plane_request etcd cpu "${ETCD_CPU_REQUEST}"
-          assert_control_plane_request etcd memory "${ETCD_MEMORY_REQUEST}"
-        fi
-
-        if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
-          echo "Verifying control-plane leader election timeout patches..."
-          for component in kube-controller-manager kube-scheduler; do
-            assert_control_plane_arg "${component}" "--leader-elect-lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
-            assert_control_plane_arg "${component}" "--leader-elect-renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
-            assert_control_plane_arg "${component}" "--leader-elect-retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
-          done
-        fi
-
+      run: bash "${{ github.action_path }}/create-gpu-kind-cluster.sh"
     - name: Print GPUs (nvkind)
       shell: bash
       run: nvkind cluster print-gpus --name="${KIND_CLUSTER_NAME}"
diff --git a/.github/actions/gpu-cluster-setup/check-runner-capacity.sh b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
new file mode 100644
index 000000000..2e83beeb9
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Fails the step unless the runner has at least MIN_FREE_DISK_GB GiB free
+# on / and at least MIN_AVAILABLE_MEMORY_GB GiB of available memory.
+# Both thresholds are read from the environment.
+
+set -euo pipefail
+
+# require_integer LABEL VALUE
+# Exits 1 unless VALUE is a non-negative integer, so the (( )) comparisons
+# below never evaluate an empty or non-numeric string.
+require_integer() {
+  local label="$1"
+  local value="$2"
+
+  if ! [[ "${value}" =~ ^[0-9]+$ ]]; then
+    echo "::error::${label} must be a non-negative integer, got '${value}'"
+    exit 1
+  fi
+}
+
+require_integer MIN_FREE_DISK_GB "${MIN_FREE_DISK_GB:-}"
+require_integer MIN_AVAILABLE_MEMORY_GB "${MIN_AVAILABLE_MEMORY_GB:-}"
+
+free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
+require_integer "free disk reported by df" "${free_disk_gb}"
+if (( free_disk_gb < MIN_FREE_DISK_GB )); then
+  echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB"
+  exit 1
+fi
+
+# Field 7 of the "Mem:" row is expected to be the "available" column.
+available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}')
+require_integer "available memory reported by free" "${available_memory_gb}"
+if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then
+  echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB"
+  exit 1
+fi
+
+echo "Runner capacity is sufficient: disk=${free_disk_gb}GiB memory=${available_memory_gb}GiB"
diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
new file mode 100644
index 000000000..0a2fcd814
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
+sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
+sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place
+sudo systemctl restart docker
diff --git a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
new file mode 100644
index 000000000..42a282e17
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
@@ -0,0 +1,420 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# validate_duration_input NAME VALUE
+# Exits 1 unless VALUE is digits followed by exactly one of s, m, or h.
+validate_duration_input() {
+  local input_name="$1"
+  local input_value="$2"
+
+  if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then
+    echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'"
+    exit 1
+  fi
+}
+
+# validate_generated_control_plane_config
+# Sanity-checks the rendered static pod patches and kind config template.
+# Reads the globals patch_dir and config_template; on a missing expected
+# line it prints the offending file and exits 1.
+validate_generated_control_plane_config() {
+  local patch_file expected
+
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+    for patch_file in "${patch_dir}"/*.yaml; do
+      if ! grep -Fxq 'apiVersion: v1' "${patch_file}" ||
+        ! grep -Fxq 'kind: Pod' "${patch_file}" ||
+        ! grep -Eq '^[[:space:]]+resources:$' "${patch_file}"; then
+        echo "::error::rendered static pod patch ${patch_file} is missing expected top-level YAML"
+        sed 's/^/ /' "${patch_file}" || true
+        exit 1
+      fi
+    done
+
+    if ! grep -Eq '^[[:space:]]*extraMounts:$' "${config_template}" ||
+      ! grep -Fq 'directory: /patches' "${config_template}"; then
+      echo "::error::rendered kind config is missing control-plane patch mounts"
+      sed 's/^/ /' "${config_template}" || true
+      exit 1
+    fi
+  fi
+
+  if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+    for expected in \
+      'apiVersion: kubeadm.k8s.io/v1beta3' \
+      'apiVersion: kubeadm.k8s.io/v1beta4' \
+      "leader-elect-lease-duration: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
+      "leader-elect-renew-deadline: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
+      "leader-elect-retry-period: \"${LEADER_ELECTION_RETRY_PERIOD}\"" \
+      "value: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
+      "value: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
+      "value: \"${LEADER_ELECTION_RETRY_PERIOD}\""; do
+      if ! grep -Fq "${expected}" "${config_template}"; then
+        echo "::error::rendered kind config is missing expected leader election setting: ${expected}"
+        sed 's/^/ /' "${config_template}" || true
+        exit 1
+      fi
+    done
+  fi
+}
+
+validate_duration_input cluster_create_timeout "${CLUSTER_CREATE_TIMEOUT}"
+validate_duration_input leader_election_lease_duration "${LEADER_ELECTION_LEASE_DURATION}"
+validate_duration_input leader_election_renew_deadline "${LEADER_ELECTION_RENEW_DEADLINE}"
+validate_duration_input leader_election_retry_period "${LEADER_ELECTION_RETRY_PERIOD}"
+
+CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}")
+if [[ -n "${KIND_NODE_IMAGE}" ]]; then
+  echo "Using kind node image: ${KIND_NODE_IMAGE}"
+  CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}")
+fi
+
+# Normalise the boolean toggles: empty means false, anything else is rejected.
+case "${CONTROL_PLANE_RESOURCE_PATCHES}" in
+  true) ;;
+  ""|false) CONTROL_PLANE_RESOURCE_PATCHES=false ;;
+  *)
+    echo "::error::control_plane_resource_patches must be true or false, got '${CONTROL_PLANE_RESOURCE_PATCHES}'"
+    exit 1
+    ;;
+esac
+
+case "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" in
+  true) ;;
+  ""|false) CONTROL_PLANE_LEADER_ELECTION_TUNING=false ;;
+  *)
+    echo "::error::control_plane_leader_election_tuning must be true or false, got '${CONTROL_PLANE_LEADER_ELECTION_TUNING}'"
+    exit 1
+    ;;
+esac
+
+if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+  patch_dir="$(mktemp -d)"
+  config_template="$(mktemp)"
+
+  # Keep YAML heredocs at column 0; indentation is literal content.
+ if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + cat > "${patch_dir}/kube-apiserver+strategic.yaml" < "${patch_dir}/kube-controller-manager+strategic.yaml" < "${patch_dir}/kube-scheduler+strategic.yaml" < "${patch_dir}/etcd+strategic.yaml" < "${config_template}" <<'EOF' +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +{{- if hasKey $ "name" }} +name: {{ $.name }} +{{- end }} +nodes: +- role: control-plane + {{- if hasKey $ "image" }} + image: {{ $.image }} + {{- end }} +EOF + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + cat >> "${config_template}" <> "${config_template}" <<'EOF' + kubeadmConfigPatches: +EOF + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + cat >> "${config_template}" <<'EOF' + - | + kind: InitConfiguration + patches: + directory: /patches +EOF + fi + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + # kind v0.31 renders kubeadm v1beta3. Keep a v1beta4 patch too so + # this remains valid when a future kind image switches API versions. + cat >> "${config_template}" <> "${config_template}" <<'EOF' +{{- range $.workers }} +- role: worker + {{- if hasKey $ "image" }} + image: {{ $.image }} + {{- end }} + + {{- if hasKey . "devices" }} + {{- $devices := .devices }} + {{- if not (kindIs "slice" $devices) }} + {{- $devices = list .devices }} + {{- end }} + extraMounts: + # We inject all NVIDIA GPUs using the nvidia-container-runtime. 
+ # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set + # in `/etc/nvidia-container-runtime/config.toml` + {{- range $d := $devices }} + - hostPath: /dev/null + containerPath: /var/run/nvidia-container-devices/{{ $d }} + {{- end }} + {{- end }} +{{- end }} +EOF + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + echo "Applying control-plane static pod resource patches from ${patch_dir}:" + for patch_file in "${patch_dir}"/*.yaml; do + echo "--- ${patch_file}" + sed 's/^/ /' "${patch_file}" + done + fi + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + echo "Increasing kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes:" + echo " lease-duration=${LEADER_ELECTION_LEASE_DURATION}" + echo " renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}" + echo " retry-period=${LEADER_ELECTION_RETRY_PERIOD}" + fi + validate_generated_control_plane_config + CREATE_ARGS+=(--config-template="${config_template}") +fi + +set +e +timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}" +create_status=$? 
+set -e +case "${create_status}" in + 0) ;; + 124) + echo "::warning::nvkind cluster create timed out after ${CLUSTER_CREATE_TIMEOUT}; continuing only if post-create checks pass" + ;; + *) + echo "::error::nvkind cluster create failed with status ${create_status}" + exit "${create_status}" + ;; +esac + +kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s +kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info +kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide +kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes | \ + grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:| cpu| memory| nvidia.com/gpu)" || true + +echo "=== Kind node container resources ===" +docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + --format '{{.Names}}' | sort | while read -r node_container; do + [[ -z "${node_container}" ]] && continue + docker inspect "${node_container}" \ + --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' + done + +echo "=== Control-plane resource requests/limits ===" +kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ + get pods -l tier=control-plane -o json | jq -r ' + .items[] as $pod | + $pod.metadata.name, + ($pod.spec.containers[] | + " " + .name + + " requests=" + ((.resources.requests // {}) | tostring) + + " limits=" + ((.resources.limits // {}) | tostring)) + ' || true + +normalize_cpu_request() { + local cpu="$1" + + if [[ "${cpu}" =~ ^([0-9]+)000m$ ]]; then + echo "${BASH_REMATCH[1]}" + return + fi + echo "${cpu}" +} + +control_plane_request() { + local component="$1" + local resource="$2" + + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ + get pod -l "component=${component}" \ + -o "jsonpath={.items[0].spec.containers[0].resources.requests.${resource}}" +} + +assert_control_plane_request() { + local 
component="$1" + local resource="$2" + local expected="$3" + local actual + + actual="$(control_plane_request "${component}" "${resource}")" + if [[ "${resource}" == "cpu" ]]; then + expected="$(normalize_cpu_request "${expected}")" + actual="$(normalize_cpu_request "${actual}")" + fi + if [[ "${actual}" != "${expected}" ]]; then + echo "::error::${component} ${resource} request is '${actual}', expected '${expected}'" + exit 1 + fi + echo "${component} ${resource} request verified: ${actual}" +} + +control_plane_command_args() { + local component="$1" + + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ + get pod -l "component=${component}" \ + -o json | jq -r '.items[0].spec.containers[0] | ((.command // []) + (.args // []))[]?' +} + +static_pod_manifest_contains_arg() { + local component="$1" + local expected="$2" + local node="${KIND_CLUSTER_NAME}-control-plane" + + docker exec "${node}" grep -Fq -- "- ${expected}" "/etc/kubernetes/manifests/${component}.yaml" +} + +dump_static_pod_manifest() { + local component="$1" + local node="${KIND_CLUSTER_NAME}-control-plane" + + echo "Static pod manifest /etc/kubernetes/manifests/${component}.yaml:" + docker exec "${node}" sed -n '1,220p' "/etc/kubernetes/manifests/${component}.yaml" || true +} + +assert_control_plane_arg() { + local component="$1" + local expected="$2" + local command_args + + command_args="$(control_plane_command_args "${component}")" + if ! 
grep -Fxq "${expected}" <<< "${command_args}"; then + if static_pod_manifest_contains_arg "${component}" "${expected}"; then + echo "::warning::${component} live mirror pod command/args did not show ${expected}; static pod manifest is patched" + return + fi + echo "::error::${component} live pod command/args does not contain ${expected}" + echo "Observed live command/args:" + echo "${command_args}" + dump_static_pod_manifest "${component}" + exit 1 + fi + echo "${component} command/args verified: ${expected}" +} + +if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + echo "Verifying control-plane resource patches..." + assert_control_plane_request kube-apiserver cpu "${API_SERVER_CPU_REQUEST}" + assert_control_plane_request kube-apiserver memory "${API_SERVER_MEMORY_REQUEST}" + assert_control_plane_request kube-controller-manager cpu "${CONTROLLER_MANAGER_CPU_REQUEST}" + assert_control_plane_request kube-controller-manager memory "${CONTROLLER_MANAGER_MEMORY_REQUEST}" + assert_control_plane_request kube-scheduler cpu "${SCHEDULER_CPU_REQUEST}" + assert_control_plane_request kube-scheduler memory "${SCHEDULER_MEMORY_REQUEST}" + assert_control_plane_request etcd cpu "${ETCD_CPU_REQUEST}" + assert_control_plane_request etcd memory "${ETCD_MEMORY_REQUEST}" +fi + +if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + echo "Verifying control-plane leader election timeout patches..." 
+  for component in kube-controller-manager kube-scheduler; do
+    assert_control_plane_arg "${component}" "--leader-elect-lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
+    assert_control_plane_arg "${component}" "--leader-elect-renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
+    assert_control_plane_arg "${component}" "--leader-elect-retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
+  done
+fi
diff --git a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
new file mode 100644
index 000000000..8e85ffcb9
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"  # label filter used for every docker query below
+if grep -Fxq "${KIND_CLUSTER_NAME}" <<< "$(kind get clusters)"; then  # here-string, not a pipe: grep -q exiting early cannot fail the test via pipefail/SIGPIPE
+  echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
+  if ! timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then
+    echo "::warning::kind delete cluster timed out or failed; falling back to direct container cleanup"
+  fi
+else
+  echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
+fi
+
+remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")
+if [[ -n "${remaining_containers}" ]]; then
+  echo "Removing stale containers for ${KIND_CLUSTER_NAME}:"
+  docker ps -a --filter "label=${kind_cluster_label}"
+  docker rm -f ${remaining_containers} || true  # intentionally unquoted (one ID per word); a failure is reported by the re-check below
+fi
+
+remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")  # re-query to confirm the cleanup actually worked
+if [[ -n "${remaining_containers}" ]]; then
+  echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:"
+  docker ps -a --filter "label=${kind_cluster_label}"
+  exit 1
+fi
diff --git a/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh
new file mode 100644
index 000000000..843496a38
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +set -euo pipefail + +sudo sysctl -w fs.inotify.max_user_watches=524288 +sudo sysctl -w fs.inotify.max_user_instances=1024 diff --git a/.github/actions/gpu-cluster-setup/install-nvkind.sh b/.github/actions/gpu-cluster-setup/install-nvkind.sh new file mode 100644 index 000000000..38f1ce0ae --- /dev/null +++ b/.github/actions/gpu-cluster-setup/install-nvkind.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +go install "github.com/NVIDIA/nvkind/cmd/nvkind@${NVKIND_VERSION}" +nvkind --help diff --git a/.github/actions/gpu-cluster-setup/runner-preflight.sh b/.github/actions/gpu-cluster-setup/runner-preflight.sh new file mode 100644 index 000000000..678b9d419 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/runner-preflight.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+echo "=== Runner baseline ==="
+date -u
+hostname
+uptime
+nproc
+free -h
+df -h /
+df -ih /
+
+for value_name in MIN_GPU_COUNT MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do
+  value="${!value_name:-}"  # default to empty: with set -u an unset input would abort before the ::error:: below
+  if ! [[ "${value}" =~ ^[0-9]+$ ]]; then
+    echo "::error::${value_name} must be an integer, got '${value}'"
+    exit 1
+  fi
+done
+
+echo "=== Docker health ==="
+docker info >/dev/null  # output discarded; only the exit status matters here
+docker version
+
+echo "=== Host GPUs ==="
+nvidia-smi -L
+nvidia-smi
+
+mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader)  # one array element per output line
+if [[ -n "${GPU_MODEL_PATTERN:-}" ]]; then  # unset or empty pattern: count every visible GPU instead
+  set +e  # grep -c exits 1 on zero matches; its status is inspected manually
+  gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic -- "${GPU_MODEL_PATTERN}")
+  grep_status=$?
+  set -e
+  if (( grep_status == 2 )); then  # status 2 is treated as an invalid regex
+    echo "::error::invalid gpu_model_pattern regex: ${GPU_MODEL_PATTERN}"
+    exit 1
+  fi
+  if (( grep_status != 0 )); then
+    gpu_count=0
+  fi
+  echo "Visible GPUs matching '${GPU_MODEL_PATTERN}': ${gpu_count}"
+else
+  gpu_count="${#gpu_names[@]}"
+  echo "Visible GPUs: ${gpu_count}"
+fi
+
+if (( gpu_count < MIN_GPU_COUNT )); then
+  echo "::error::visible GPU count ${gpu_count} is below required minimum ${MIN_GPU_COUNT}"
+  exit 1
+fi
+
+echo "=== Existing kind state ==="
+kind get clusters || true  # informational only; never fails the preflight
+docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
diff --git a/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
new file mode 100644
index 000000000..6f01ba156
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail +timeout 300s docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L diff --git a/.github/actions/gpu-cluster-setup/validate-env.sh b/.github/actions/gpu-cluster-setup/validate-env.sh new file mode 100644 index 000000000..697d077c2 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/validate-env.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then + echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow" + exit 1 +fi diff --git a/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh new file mode 100644 index 000000000..3d58a4887 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail +echo "=== Kind node image cache ===" +if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then + echo "Kind node image already cached: ${KIND_NODE_IMAGE}" +else + echo "Pulling kind node image: ${KIND_NODE_IMAGE}" + timeout 600s docker pull "${KIND_NODE_IMAGE}" +fi +free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9') +if (( free_disk_gb < MIN_FREE_DISK_GB )); then + echo "::error::free disk on / is ${free_disk_gb}GiB after warming ${KIND_NODE_IMAGE}, need at least ${MIN_FREE_DISK_GB}GiB" + exit 1 +fi +echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gb}GiB" diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml index 86d247932..b30c63f2d 100644 --- a/.github/actions/gpu-operator-install/action.yml +++ b/.github/actions/gpu-operator-install/action.yml @@ -49,105 +49,33 @@ runs: - name: Install GPU Operator (helm) if: inputs.method == 'helm' shell: bash - run: | - helm repo add nvidia https://helm.ngc.nvidia.com/nvidia - helm repo update - helm upgrade -i \ - --kube-context="kind-${KIND_CLUSTER_NAME}" \ - --namespace gpu-operator \ - --create-namespace \ - --set driver.enabled=false \ - --set toolkit.enabled=false \ - --set dcgmExporter.enabled=false \ - --set nfd.enabled=true \ - --wait --timeout=600s \ - gpu-operator nvidia/gpu-operator - + run: bash "${{ github.action_path 
}}/install-gpu-operator-helm.sh" - name: Wait for GPU operands (helm) if: inputs.method == 'helm' shell: bash - run: | - echo "Waiting for device plugin to be ready..." - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true - echo "GPU Operator pods:" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods - + run: bash "${{ github.action_path }}/wait-gpu-operands-helm.sh" # --- Bundle mode: aicr recipe → bundle → deploy --- - name: Generate recipe if: inputs.method == 'bundle' shell: bash - run: | - PLATFORM_FLAG="" - if [[ -n "${{ inputs.platform }}" ]]; then - PLATFORM_FLAG="--platform ${{ inputs.platform }}" - fi - ./aicr recipe \ - --service kind \ - --accelerator ${{ inputs.accelerator }} \ - --os ubuntu \ - --intent ${{ inputs.intent }} \ - ${PLATFORM_FLAG} \ - --output recipe.yaml - echo "Recipe written to recipe.yaml" - + env: + AICR_ACCELERATOR: ${{ inputs.accelerator }} + AICR_INTENT: ${{ inputs.intent }} + AICR_PLATFORM: ${{ inputs.platform }} + run: bash "${{ github.action_path }}/generate-recipe.sh" - name: Generate deployment bundle if: inputs.method == 'bundle' shell: bash - run: | - ./aicr bundle \ - --recipe recipe.yaml \ - --accelerated-node-toleration nvidia.com/gpu:NoSchedule \ - --output bundle - echo "--- Bundle contents ---" - ls -la bundle/ - + run: bash "${{ github.action_path }}/generate-bundle.sh" - name: Install bundle into cluster if: inputs.method == 'bundle' shell: bash - run: | - cd bundle - # The default keeps legacy bundle-mode behavior: do not wait on every - # Helm resource and keep deploying after component failures. H100 - # qualification jobs override these inputs to hard-fail and wait. 
- chmod +x deploy.sh - DEPLOY_ARGS=() - if [[ "${{ inputs.wait }}" != "true" ]]; then - DEPLOY_ARGS+=(--no-wait) - fi - if [[ "${{ inputs.best_effort }}" == "true" ]]; then - DEPLOY_ARGS+=(--best-effort) - fi - if [[ "${#DEPLOY_ARGS[@]}" -gt 0 ]]; then - echo "Deploying bundle with args: ${DEPLOY_ARGS[*]}" - else - echo "Deploying bundle with default args" - fi - ./deploy.sh "${DEPLOY_ARGS[@]}" - + env: + AICR_DEPLOY_WAIT: ${{ inputs.wait }} + AICR_DEPLOY_BEST_EFFORT: ${{ inputs.best_effort }} + run: bash "${{ github.action_path }}/install-bundle.sh" - name: Wait for GPU operands (bundle) if: inputs.method == 'bundle' shell: bash - run: | - echo "Waiting for GPU operator controller to deploy operands..." - # The GPU operator controller watches ClusterPolicy and creates - # DaemonSets for device-plugin, NFD, GFD, etc. This happens - # asynchronously after the helm install completes. - for i in $(seq 1 30); do - count=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l) - if [[ "$count" -gt 0 ]]; then - echo "Device plugin DaemonSet found." - break - fi - echo "Waiting for device plugin DaemonSet to be created... (${i}/30)" - sleep 10 - done - echo "Waiting for device plugin rollout..." - # Operands are excluded from control-plane nodes via nodeAffinity in - # the kind overlay, so all scheduled pods should become ready. 
- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s - echo "GPU Operator pods:" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods + run: bash "${{ github.action_path }}/wait-gpu-operands-bundle.sh" diff --git a/.github/actions/gpu-operator-install/generate-bundle.sh b/.github/actions/gpu-operator-install/generate-bundle.sh new file mode 100644 index 000000000..c7dd3f413 --- /dev/null +++ b/.github/actions/gpu-operator-install/generate-bundle.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +./aicr bundle \ + --recipe recipe.yaml \ + --accelerated-node-toleration nvidia.com/gpu:NoSchedule \ + --output bundle +echo "--- Bundle contents ---" +ls -la bundle/ diff --git a/.github/actions/gpu-operator-install/generate-recipe.sh b/.github/actions/gpu-operator-install/generate-recipe.sh new file mode 100644 index 000000000..6015e69ed --- /dev/null +++ b/.github/actions/gpu-operator-install/generate-recipe.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +RECIPE_ARGS=( + --service kind + --accelerator "${AICR_ACCELERATOR}" + --os ubuntu + --intent "${AICR_INTENT}" +) +if [[ -n "${AICR_PLATFORM}" ]]; then + RECIPE_ARGS+=(--platform "${AICR_PLATFORM}") +fi + +./aicr recipe "${RECIPE_ARGS[@]}" --output recipe.yaml +echo "Recipe written to recipe.yaml" diff --git a/.github/actions/gpu-operator-install/install-bundle.sh b/.github/actions/gpu-operator-install/install-bundle.sh new file mode 100644 index 000000000..cefa4ce5d --- /dev/null +++ b/.github/actions/gpu-operator-install/install-bundle.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +cd bundle +# The default keeps legacy bundle-mode behavior: do not wait on every +# Helm resource and keep deploying after component failures. H100 +# qualification jobs override these inputs to hard-fail and wait. 
+chmod +x deploy.sh +DEPLOY_ARGS=() +if [[ "${AICR_DEPLOY_WAIT}" != "true" ]]; then + DEPLOY_ARGS+=(--no-wait) +fi +if [[ "${AICR_DEPLOY_BEST_EFFORT}" == "true" ]]; then + DEPLOY_ARGS+=(--best-effort) +fi +if [[ "${#DEPLOY_ARGS[@]}" -gt 0 ]]; then + echo "Deploying bundle with args: ${DEPLOY_ARGS[*]}" +else + echo "Deploying bundle with default args" +fi +./deploy.sh "${DEPLOY_ARGS[@]}" diff --git a/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh new file mode 100644 index 000000000..0aea450eb --- /dev/null +++ b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+set -euo pipefail
+
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+helm repo update
+helm upgrade -i \
+  --kube-context="kind-${KIND_CLUSTER_NAME}" \
+  --namespace gpu-operator \
+  --create-namespace \
+  --set driver.enabled=false \
+  --set toolkit.enabled=false \
+  --set dcgmExporter.enabled=false \
+  --set nfd.enabled=true \
+  --wait --timeout=600s \
+  gpu-operator nvidia/gpu-operator
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
new file mode 100644
index 000000000..496eb372e
--- /dev/null
+++ b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+echo "Waiting for GPU operator controller to deploy operands..."
+# The GPU operator controller watches ClusterPolicy and creates
+# DaemonSets for device-plugin, NFD, GFD, etc. This happens
+# asynchronously after the helm install completes.
+for i in $(seq 1 30); do
+  count=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+    get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l || true)  # a failed kubectl call must not trip set -e/pipefail; count stays 0 and the loop retries
+  if [[ "$count" -gt 0 ]]; then
+    echo "Device plugin DaemonSet found."
+    break
+  fi
+  echo "Waiting for device plugin DaemonSet to be created... (${i}/30)"
+  sleep 10
+done
+echo "Waiting for device plugin rollout..."
+# Operands are excluded from control-plane nodes via nodeAffinity in
+# the kind overlay, so all scheduled pods should become ready.
+kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+  rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
+echo "GPU Operator pods:"
+kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
new file mode 100644
index 000000000..2ad7e801d
--- /dev/null
+++ b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+echo "Waiting for device plugin to be ready..."
+kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ + rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true +echo "GPU Operator pods:" +kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml index b89224a60..7af987da0 100644 --- a/.github/actions/gpu-snapshot-validate/action.yml +++ b/.github/actions/gpu-snapshot-validate/action.yml @@ -36,54 +36,18 @@ runs: steps: - name: Run aicr snapshot shell: bash - run: | - ./aicr snapshot \ - --kubeconfig="${HOME}/.kube/config" \ - --namespace=default \ - --image=ko.local:smoke-test \ - --require-gpu \ - --timeout="${{ inputs.snapshot_timeout }}" \ - --output=snapshot.yaml - echo "--- Snapshot output ---" - cat snapshot.yaml - + env: + SNAPSHOT_TIMEOUT: ${{ inputs.snapshot_timeout }} + run: bash "${{ github.action_path }}/run-snapshot.sh" - name: Validate snapshot detected GPU shell: bash - run: | - # Query by subtype field (not index) — #502 added a "hardware" subtype before "smi". 
- GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml) - GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml) - echo "GPU model: ${GPU_MODEL}" - echo "GPU count: ${GPU_COUNT}" - if [[ "${GPU_MODEL}" != *"${{ inputs.gpu_model }}"* ]]; then - echo "::error::Expected ${{ inputs.gpu_model }} GPU in snapshot, got: ${GPU_MODEL}" - exit 1 - fi - if [[ "${GPU_COUNT}" -lt ${{ inputs.min_gpu_count }} ]]; then - echo "::error::Expected gpu-count >= ${{ inputs.min_gpu_count }}, got: ${GPU_COUNT}" - exit 1 - fi - echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}" - + env: + EXPECTED_GPU_MODEL: ${{ inputs.gpu_model }} + MIN_GPU_COUNT: ${{ inputs.min_gpu_count }} + run: bash "${{ github.action_path }}/validate-snapshot-gpu.sh" - name: Debug snapshot Job if: failure() shell: bash - run: | - kubectl_kind() { - timeout 30s kubectl --request-timeout=10s --context="kind-${{ inputs.cluster_name }}" "$@" - } - - echo "=== Snapshot Job ===" - kubectl_kind -n default get job aicr -o yaml || true - echo "=== Snapshot Pods ===" - kubectl_kind -n default get pods -l app.kubernetes.io/name=aicr -o wide || true - echo "=== Snapshot Job describe ===" - kubectl_kind -n default describe job aicr || true - echo "=== Snapshot Pod describe ===" - kubectl_kind -n default describe pods -l app.kubernetes.io/name=aicr || true - echo "=== Snapshot current logs ===" - kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true - echo "=== Snapshot previous logs ===" - kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true - echo "=== Snapshot ConfigMap ===" - kubectl_kind -n default get configmap aicr-snapshot -o yaml || true + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.action_path 
}}/debug-snapshot-job.sh" diff --git a/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh new file mode 100644 index 000000000..2e0f1547f --- /dev/null +++ b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +echo "=== Snapshot Job ===" +kubectl_kind -n default get job aicr -o yaml || true +echo "=== Snapshot Pods ===" +kubectl_kind -n default get pods -l app.kubernetes.io/name=aicr -o wide || true +echo "=== Snapshot Job describe ===" +kubectl_kind -n default describe job aicr || true +echo "=== Snapshot Pod describe ===" +kubectl_kind -n default describe pods -l app.kubernetes.io/name=aicr || true +echo "=== Snapshot current logs ===" +kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true +echo "=== Snapshot previous logs ===" +kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true +echo "=== Snapshot ConfigMap ===" +kubectl_kind -n default get configmap aicr-snapshot -o yaml || true diff --git a/.github/actions/gpu-snapshot-validate/run-snapshot.sh b/.github/actions/gpu-snapshot-validate/run-snapshot.sh new file mode 
100644 index 000000000..e45b575ef --- /dev/null +++ b/.github/actions/gpu-snapshot-validate/run-snapshot.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +./aicr snapshot \ + --kubeconfig="${HOME}/.kube/config" \ + --namespace=default \ + --image=ko.local:smoke-test \ + --require-gpu \ + --timeout="${SNAPSHOT_TIMEOUT}" \ + --output=snapshot.yaml +echo "--- Snapshot output ---" +cat snapshot.yaml diff --git a/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh new file mode 100644 index 000000000..57a622a2d --- /dev/null +++ b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +# Query by subtype field (not index) — #502 added a "hardware" subtype before "smi". +GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml) +GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml) +echo "GPU model: ${GPU_MODEL}" +echo "GPU count: ${GPU_COUNT}" +if [[ "${GPU_MODEL}" != *"${EXPECTED_GPU_MODEL}"* ]]; then + echo "::error::Expected ${EXPECTED_GPU_MODEL} GPU in snapshot, got: ${GPU_MODEL}" + exit 1 +fi +if [[ "${GPU_COUNT}" -lt ${MIN_GPU_COUNT} ]]; then + echo "::error::Expected gpu-count >= ${MIN_GPU_COUNT}, got: ${GPU_COUNT}" + exit 1 +fi +echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}" diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml index c085ed630..a1eef57b9 100644 --- a/.github/actions/gpu-test-cleanup/action.yml +++ b/.github/actions/gpu-test-cleanup/action.yml @@ -32,118 +32,19 @@ runs: shell: bash env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} - run: | - set -o pipefail - mkdir -p /tmp/debug-artifacts - CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd" - kubectl_kind() { - timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" - } - - kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true - kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true - kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true - kubectl_kind get --raw='/readyz?verbose' > /tmp/debug-artifacts/apiserver-readyz.txt 2>&1 || true - kubectl_kind -n kube-system get pods -l tier=control-plane -o wide \ - > /tmp/debug-artifacts/control-plane-pods.txt 2>&1 || true - kubectl_kind -n kube-system get events 
--sort-by='.lastTimestamp' \ - > /tmp/debug-artifacts/kube-system-events.txt 2>&1 || true - for component in ${CONTROL_PLANE_COMPONENTS}; do - kubectl_kind -n kube-system describe pod -l "component=${component}" \ - > "/tmp/debug-artifacts/${component}-describe.txt" 2>&1 || true - kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --tail=300 \ - > "/tmp/debug-artifacts/${component}-logs.txt" 2>&1 || true - kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --previous --tail=300 \ - > "/tmp/debug-artifacts/${component}-previous-logs.txt" 2>&1 || true - kubectl_kind -n kube-system get lease "${component}" -o yaml \ - > "/tmp/debug-artifacts/${component}-lease.yaml" 2>&1 || true - done - kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true - kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true - kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true - kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true - tar_inputs=() - [[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml) - [[ -d bundle ]] && tar_inputs+=(bundle) - if [[ "${#tar_inputs[@]}" -gt 0 ]]; then - echo "Archiving runtime bundle inputs: ${tar_inputs[*]}" - tar -czf /tmp/debug-artifacts/aicr-runtime-bundle.tar.gz "${tar_inputs[@]}" || true - else - echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive" - fi - - docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ - --format '{{.Names}}' | sort | while read -r node_container; do - [[ -z "${node_container}" ]] && continue - node_file="${node_container//[^A-Za-z0-9_.-]/_}" - timeout 30s docker exec "${node_container}" journalctl -u kubelet \ - --since "90 minutes ago" 
--no-pager \ - > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true - timeout 30s docker exec "${node_container}" journalctl -u containerd \ - --since "90 minutes ago" --no-pager \ - > "/tmp/debug-artifacts/${node_file}-containerd-journal.txt" 2>&1 || true - timeout 30s docker exec "${node_container}" crictl ps -a \ - > "/tmp/debug-artifacts/${node_file}-crictl-ps-a.txt" 2>&1 || true - timeout 30s docker exec "${node_container}" crictl pods \ - > "/tmp/debug-artifacts/${node_file}-crictl-pods.txt" 2>&1 || true - timeout 30s docker exec "${node_container}" crictl stats \ - > "/tmp/debug-artifacts/${node_file}-crictl-stats.txt" 2>&1 || true - timeout 30s docker exec "${node_container}" sh -c ' - date - uptime || true - free -h || true - df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h - echo "--- top cpu/memory processes ---" - ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true - ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true - timeout 120s docker exec "${node_container}" sh -c ' - for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do - echo "=== ${component} static pod manifest ===" - sed -n "1,220p" "/etc/kubernetes/manifests/${component}.yaml" 2>/dev/null || true - echo "=== ${component} CRI containers ===" - crictl ps -a --name "${component}" || true - count=0 - for container_id in $(crictl ps -a --name "${component}" -q 2>/dev/null); do - count=$((count + 1)) - if [ "${count}" -gt 8 ]; then - echo "Skipping remaining ${component} CRI containers after first 8 entries." 
- break - fi - echo "=== crictl inspect ${component} ${container_id} ===" - crictl inspect "${container_id}" || true - echo "=== crictl logs ${component} ${container_id} ===" - crictl logs --tail=300 "${container_id}" || true - done - done - ' > "/tmp/debug-artifacts/${node_file}-control-plane-cri.txt" 2>&1 || true - done - + run: bash "${{ github.action_path }}/collect-debug-artifacts.sh" - name: Export kind logs if: failure() || cancelled() shell: bash env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} - run: | - mkdir -p /tmp/kind-logs - timeout 300s kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true - + run: bash "${{ github.action_path }}/export-kind-logs.sh" - name: Cleanup if: always() shell: bash env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} - run: | - timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true - kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" - remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}") - if [[ -n "${remaining_containers}" ]]; then - echo "Removing leftover kind containers for ${KIND_CLUSTER_NAME}:" - docker ps -a --filter "label=${kind_cluster_label}" - docker rm -f ${remaining_containers} || true - fi - timeout 60s docker builder prune -f --filter "until=24h" || true - timeout 60s docker system prune -f --filter "until=24h" || true - + run: bash "${{ github.action_path }}/cleanup-kind-cluster.sh" - name: Upload debug artifacts if: failure() || cancelled() uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 diff --git a/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh new file mode 100644 index 000000000..4603d494d --- /dev/null +++ b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true +docker_timeout() { + local limit="$1" + shift + timeout "${limit}" docker "$@" +} +kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" +remaining_containers=$(docker_timeout 30s ps -aq --filter "label=${kind_cluster_label}" || true) +if [[ -n "${remaining_containers}" ]]; then + echo "Removing leftover kind containers for ${KIND_CLUSTER_NAME}:" + docker_timeout 30s ps -a --filter "label=${kind_cluster_label}" || true + docker_timeout 30s rm -f ${remaining_containers} || true +fi +docker_timeout 60s builder prune -f --filter "until=24h" || true +docker_timeout 60s system prune -f --filter "until=24h" || true diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh new file mode 100644 index 000000000..417606c7d --- /dev/null +++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o pipefail +mkdir -p /tmp/debug-artifacts +CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd" +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} +docker_timeout() { + local limit="$1" + shift + timeout "${limit}" docker "$@" +} + +kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true +kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true +kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true +kubectl_kind get --raw='/readyz?verbose' > /tmp/debug-artifacts/apiserver-readyz.txt 2>&1 || true +kubectl_kind -n kube-system get pods -l tier=control-plane -o wide \ + > /tmp/debug-artifacts/control-plane-pods.txt 2>&1 || true +kubectl_kind -n kube-system get events --sort-by='.lastTimestamp' \ + > /tmp/debug-artifacts/kube-system-events.txt 2>&1 || true +for component in ${CONTROL_PLANE_COMPONENTS}; do + kubectl_kind -n kube-system describe pod -l "component=${component}" \ + > "/tmp/debug-artifacts/${component}-describe.txt" 2>&1 || true + kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --tail=300 \ + > "/tmp/debug-artifacts/${component}-logs.txt" 2>&1 || true + kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --previous --tail=300 \ + > "/tmp/debug-artifacts/${component}-previous-logs.txt" 2>&1 || true + kubectl_kind -n kube-system get lease "${component}" -o yaml \ + > 
"/tmp/debug-artifacts/${component}-lease.yaml" 2>&1 || true +done +kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true +kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true +kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true +kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true +tar_inputs=() +[[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml) +[[ -d bundle ]] && tar_inputs+=(bundle) +if [[ "${#tar_inputs[@]}" -gt 0 ]]; then + echo "Archiving runtime bundle inputs: ${tar_inputs[*]}" + tar -czf /tmp/debug-artifacts/aicr-runtime-bundle.tar.gz "${tar_inputs[@]}" || true +else + echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive" +fi + +docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + --format '{{.Names}}' | sort | while read -r node_container; do + [[ -z "${node_container}" ]] && continue + node_file="${node_container//[^A-Za-z0-9_.-]/_}" + docker_timeout 30s exec "${node_container}" journalctl -u kubelet \ + --since "90 minutes ago" --no-pager \ + > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" journalctl -u containerd \ + --since "90 minutes ago" --no-pager \ + > "/tmp/debug-artifacts/${node_file}-containerd-journal.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" crictl ps -a \ + > "/tmp/debug-artifacts/${node_file}-crictl-ps-a.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" crictl pods \ + > "/tmp/debug-artifacts/${node_file}-crictl-pods.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" crictl stats \ + > "/tmp/debug-artifacts/${node_file}-crictl-stats.txt" 2>&1 || true + 
docker_timeout 30s exec "${node_container}" sh -c ' + date + uptime || true + free -h || true + df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h + echo "--- top cpu/memory processes ---" + ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true + ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true + docker_timeout 120s exec "${node_container}" sh -c ' + for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do + echo "=== ${component} static pod manifest ===" + sed -n "1,220p" "/etc/kubernetes/manifests/${component}.yaml" 2>/dev/null || true + echo "=== ${component} CRI containers ===" + crictl ps -a --name "${component}" || true + count=0 + for container_id in $(crictl ps -a --name "${component}" -q 2>/dev/null); do + count=$((count + 1)) + if [ "${count}" -gt 8 ]; then + echo "Skipping remaining ${component} CRI containers after first 8 entries." + break + fi + echo "=== crictl inspect ${component} ${container_id} ===" + crictl inspect "${container_id}" || true + echo "=== crictl logs ${component} ${container_id} ===" + crictl logs --tail=300 "${container_id}" || true + done + done + ' > "/tmp/debug-artifacts/${node_file}-control-plane-cri.txt" 2>&1 || true + done || true diff --git a/.github/actions/gpu-test-cleanup/export-kind-logs.sh b/.github/actions/gpu-test-cleanup/export-kind-logs.sh new file mode 100644 index 000000000..2522481eb --- /dev/null +++ b/.github/actions/gpu-test-cleanup/export-kind-logs.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +mkdir -p /tmp/kind-logs +timeout 300s kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml index f66848e6f..f26aa38a5 100644 --- a/.github/actions/install-karpenter-kwok/action.yml +++ b/.github/actions/install-karpenter-kwok/action.yml @@ -38,11 +38,7 @@ runs: - name: Resolve versions id: versions shell: bash - run: | - echo "karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" >> "$GITHUB_OUTPUT" - echo "ko=$(yq eval '.build_tools.ko' .settings.yaml)" >> "$GITHUB_OUTPUT" - echo "go=$(go env GOVERSION)" >> "$GITHUB_OUTPUT" - + run: bash "${{ github.action_path }}/resolve-versions.sh" - name: Install ko uses: ./.github/actions/setup-build-tools with: @@ -68,30 +64,4 @@ runs: KWOK_HELM_TIMEOUT: ${{ inputs.kwok_helm_timeout }} KO_BUILD_TIMEOUT: ${{ inputs.ko_build_timeout }} KARPENTER_HELM_TIMEOUT: ${{ inputs.karpenter_helm_timeout }} - run: | - set -euo pipefail - validate_duration_input() { - local input_name="$1" - local input_value="$2" - - if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then - echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'" - exit 1 - fi - } - - validate_seconds_input() { - local input_name="$1" - local input_value="$2" - - if ! 
[[ "${input_value}" =~ ^[0-9]+$ ]]; then - echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'" - exit 1 - fi - } - - validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT}" - validate_seconds_input ko_build_timeout "${KO_BUILD_TIMEOUT}" - validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT}" - bash kwok/scripts/install-karpenter-kwok.sh - kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml + run: bash "${{ github.action_path }}/install-karpenter-kwok.sh" diff --git a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh new file mode 100644 index 000000000..2fdb26312 --- /dev/null +++ b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail +validate_duration_input() { + local input_name="$1" + local input_value="$2" + + if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'" + exit 1 + fi +} + +validate_seconds_input() { + local input_name="$1" + local input_value="$2" + + if ! 
[[ "${input_value}" =~ ^[0-9]+$ ]]; then + echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'" + exit 1 + fi +} + +validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT}" +validate_seconds_input ko_build_timeout "${KO_BUILD_TIMEOUT}" +validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT}" +bash kwok/scripts/install-karpenter-kwok.sh +kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml diff --git a/.github/actions/install-karpenter-kwok/resolve-versions.sh b/.github/actions/install-karpenter-kwok/resolve-versions.sh new file mode 100644 index 000000000..84e85458e --- /dev/null +++ b/.github/actions/install-karpenter-kwok/resolve-versions.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +echo "karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" >> "$GITHUB_OUTPUT" +echo "ko=$(yq eval '.build_tools.ko' .settings.yaml)" >> "$GITHUB_OUTPUT" +echo "go=$(go env GOVERSION)" >> "$GITHUB_OUTPUT" diff --git a/.github/scripts/gpu-chainsaw-health.sh b/.github/scripts/gpu-chainsaw-health.sh new file mode 100644 index 000000000..35450eb00 --- /dev/null +++ b/.github/scripts/gpu-chainsaw-health.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euo pipefail + +test_dir="$1" + +chainsaw test \ + --test-dir "${test_dir}" \ + --config tests/chainsaw/chainsaw-config.yaml \ + --cleanup-timeout 120s \ + --delete-timeout 120s diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh new file mode 100644 index 000000000..0d1cd76a2 --- /dev/null +++ b/.github/scripts/gpu-debug-diagnostics.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +set -o pipefail + +mode="${GPU_TEST_DIAGNOSTIC_MODE:-smoke}" + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +print_workload_images() { + local ns="$1" + kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \ + | jq -r ' + .items[] | + [ + .kind, + .metadata.namespace + "/" + .metadata.name, + (([.spec.template.spec.containers[]?.image] + + [.spec.template.spec.initContainers[]?.image]) | unique | join(",")) + ] | @tsv + ' || true +} + +print_workload_inventory() { + local ns + echo "=== Workload image inventory ===" + for ns in "$@"; do + echo "--- ${ns} ---" + print_workload_images "${ns}" + done +} + +print_grafana_diagnostics() { + echo "=== Grafana deployment ===" + kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true + echo "=== Grafana pods ===" + kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true + echo "=== Grafana deployment describe ===" + kubectl_kind -n monitoring describe deployment grafana 
2>/dev/null || true + echo "=== Grafana pod describe ===" + kubectl_kind -n monitoring describe pods -l app.kubernetes.io/name=grafana 2>/dev/null || true +} + +print_kai_diagnostics() { + echo "=== KAI scheduler pods ===" + kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true + echo "=== KAI admission deployment ===" + kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true + echo "=== KAI admission deployment describe ===" + kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true + echo "=== KAI admission pod describe ===" + kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \ + | grep '^pod/admission-' \ + | while read -r pod; do + kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true + done || true + echo "=== KAI admission logs ===" + kubectl_kind -n kai-scheduler logs deployment/admission --all-containers --tail=200 2>/dev/null || true + echo "=== KAI scheduler logs ===" + kubectl_kind -n kai-scheduler logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true + echo "=== KAI scheduler queues ===" + kubectl_kind get queues -A 2>/dev/null || true + echo "=== KAI scheduler podgroups ===" + kubectl_kind get podgroups -A 2>/dev/null || true + echo "=== Recent events (kai-scheduler) ===" + kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true +} + +print_common_gpu_diagnostics() { + echo "=== ClusterPolicy status ===" + kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true + echo "=== GPU Operator pods ===" + kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true + echo "=== Non-running pods (all namespaces) ===" + kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true + echo "=== Recent events (gpu-operator) ===" + kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true +} + +case "${mode}" in + smoke) + 
print_common_gpu_diagnostics + echo "=== Node status ===" + kubectl_kind get nodes -o wide 2>/dev/null || true + ;; + training) + print_workload_inventory cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \ + nvidia-network-operator kai-scheduler kubeflow + print_grafana_diagnostics + print_kai_diagnostics + echo "=== Kubeflow Trainer deployment ===" + kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true + echo "=== Kubeflow pods ===" + kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true + echo "=== Kubeflow validating webhooks ===" + kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true + echo "=== Kubeflow Trainer CRD ===" + kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true + echo "=== Non-running pods (all namespaces) ===" + kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true + echo "=== GPU Operator pods ===" + kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true + echo "=== Node resources ===" + kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true + ;; + inference) + print_workload_inventory cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \ + nvidia-network-operator kai-scheduler dynamo-system kgateway-system + print_common_gpu_diagnostics + echo "=== Dynamo pods ===" + kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true + echo "=== Dynamo operator logs ===" + kubectl_kind -n dynamo-system logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true + echo "=== Recent events (dynamo-system) ===" + kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + print_kai_diagnostics + echo "=== Custom metrics API ===" + for metric in gpu_utilization gpu_memory_used gpu_power_usage; do + 
echo "--- ${metric} ---" + for ns in gpu-operator dynamo-system; do + kubectl_kind get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${ns}/pods/*/${metric}" 2>/dev/null | jq . || true + done + done + print_grafana_diagnostics + echo "=== prometheus-adapter pods ===" + kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true + echo "=== kgateway pods ===" + kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true + echo "=== GatewayClass status ===" + kubectl_kind get gatewayclass -o yaml 2>/dev/null || true + echo "=== Gateway status ===" + kubectl_kind get gateways -A -o yaml 2>/dev/null || true + echo "=== DCGM Exporter pods ===" + kubectl_kind -n gpu-operator get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true + echo "=== Monitoring pods ===" + kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true + echo "=== DRA ResourceSlices ===" + kubectl_kind get resourceslices -o wide 2>/dev/null || true + echo "=== Node status ===" + kubectl_kind get nodes -o wide 2>/dev/null || true + ;; + *) + echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}" + exit 1 + ;; +esac diff --git a/.github/scripts/gpu-smoke-run-nvidia-smi.sh b/.github/scripts/gpu-smoke-run-nvidia-smi.sh new file mode 100644 index 000000000..cefff49fa --- /dev/null +++ b/.github/scripts/gpu-smoke-run-nvidia-smi.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: gpu-smoke-test +spec: + restartPolicy: Never + containers: + - name: nvidia-smi + image: ubuntu:22.04 + command: ["nvidia-smi"] + resources: + limits: + nvidia.com/gpu: 1 +EOF + +echo "Waiting for gpu-smoke-test pod to complete..." 
+kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \ + --for=condition=Ready --timeout=120s || true +kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \ + --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s diff --git a/.github/scripts/gpu-validate-conformance.sh b/.github/scripts/gpu-validate-conformance.sh new file mode 100644 index 000000000..2fb3fa4a3 --- /dev/null +++ b/.github/scripts/gpu-validate-conformance.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ +./aicr validate \ + --recipe recipe.yaml \ + --phase conformance \ + --namespace gpu-operator \ + --kubeconfig="${HOME}/.kube/config" \ + --require-gpu \ + --image=ko.local:smoke-test \ + --timeout=10m \ + --toleration '*' \ + --output=validation-result.yaml \ + --evidence-dir=conformance-evidence diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index 27d9d27cc..08152c25e 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -38,6 +38,8 @@ jobs: should-run: ${{ steps.filter.outputs.matched }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 id: filter with: @@ -215,12 +217,7 @@ jobs: - name: Run chainsaw health checks # The H100 stack can make namespace cleanup API calls slow under load. # Keep cleanup enabled, but allow more than the default 30s deadline. 
- run: | - chainsaw test \ - --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --config tests/chainsaw/chainsaw-config.yaml \ - --cleanup-timeout 120s \ - --delete-timeout 120s + run: bash .github/scripts/gpu-chainsaw-health.sh tests/chainsaw/ai-conformance/kind-inference-dynamo # --- CNCF AI Conformance validation --- # Runs after the stack health checks so gateway and metrics validators @@ -245,19 +242,7 @@ jobs: - name: Validate CNCF AI Conformance id: validate-conformance - run: | - AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ - ./aicr validate \ - --recipe recipe.yaml \ - --phase conformance \ - --namespace gpu-operator \ - --kubeconfig="${HOME}/.kube/config" \ - --require-gpu \ - --image=ko.local:smoke-test \ - --timeout=10m \ - --toleration '*' \ - --output=validation-result.yaml \ - --evidence-dir=conformance-evidence + run: bash .github/scripts/gpu-validate-conformance.sh # Dynamo smoke is intentionally disabled for now. The vLLM runtime image # adds significant latency and flakiness in Kind CI, and training has no @@ -282,105 +267,9 @@ jobs: if: failure() timeout-minutes: 5 shell: bash - run: | - set -o pipefail - kubectl_kind() { - timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" - } - print_workload_images() { - local ns="$1" - kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \ - | jq -r ' - .items[] | - [ - .kind, - .metadata.namespace + "/" + .metadata.name, - (([.spec.template.spec.containers[]?.image] + - [.spec.template.spec.initContainers[]?.image]) | unique | join(",")) - ] | @tsv - ' || true - } - - echo "=== Workload image inventory ===" - for NS in cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \ - nvidia-network-operator kai-scheduler dynamo-system kgateway-system; do - echo "--- ${NS} ---" - print_workload_images "${NS}" - done - echo "=== ClusterPolicy status ===" - kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true - echo 
"=== GPU Operator pods ===" - kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true - echo "=== Non-running pods (all namespaces) ===" - kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true - echo "=== Recent events (gpu-operator) ===" - kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - echo "=== Dynamo pods ===" - kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true - echo "=== Dynamo operator logs ===" - kubectl_kind -n dynamo-system \ - logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true - echo "=== Recent events (dynamo-system) ===" - kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - echo "=== KAI scheduler pods ===" - kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true - echo "=== KAI admission deployment ===" - kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true - echo "=== KAI admission deployment describe ===" - kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true - echo "=== KAI admission pod describe ===" - kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \ - | grep '^pod/admission-' \ - | while read -r pod; do - kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true - done || true - echo "=== KAI admission logs ===" - kubectl_kind -n kai-scheduler \ - logs deployment/admission --all-containers --tail=200 2>/dev/null || true - echo "=== KAI scheduler logs ===" - kubectl_kind -n kai-scheduler \ - logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true - echo "=== KAI scheduler queues ===" - kubectl_kind get queues -A 2>/dev/null || true - echo "=== KAI scheduler podgroups ===" - kubectl_kind get podgroups -A 2>/dev/null || true - echo "=== Recent events (kai-scheduler) ===" - kubectl_kind -n kai-scheduler get events 
--sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true - echo "=== Custom metrics API ===" - for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do - echo "--- ${METRIC} ---" - for NS in gpu-operator dynamo-system; do - kubectl_kind get --raw \ - "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true - done - done - echo "=== Grafana deployment ===" - kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true - echo "=== Grafana pods ===" - kubectl_kind -n monitoring get pods \ - -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true - echo "=== Grafana deployment describe ===" - kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true - echo "=== Grafana pod describe ===" - kubectl_kind -n monitoring describe pods \ - -l app.kubernetes.io/name=grafana 2>/dev/null || true - echo "=== prometheus-adapter pods ===" - kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true - echo "=== kgateway pods ===" - kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true - echo "=== GatewayClass status ===" - kubectl_kind get gatewayclass -o yaml 2>/dev/null || true - echo "=== Gateway status ===" - kubectl_kind get gateways -A -o yaml 2>/dev/null || true - echo "=== DCGM Exporter pods ===" - kubectl_kind -n gpu-operator \ - get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true - echo "=== Monitoring pods ===" - kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true - echo "=== DRA ResourceSlices ===" - kubectl_kind get resourceslices -o wide 2>/dev/null || true - echo "=== Node status ===" - kubectl_kind get nodes -o wide 2>/dev/null || true + env: + GPU_TEST_DIAGNOSTIC_MODE: inference + run: bash .github/scripts/gpu-debug-diagnostics.sh - name: GPU Test Cleanup if: always() diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index 
972e38f73..f5ecbaf7f 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -38,6 +38,8 @@ jobs: should-run: ${{ steps.filter.outputs.matched }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 id: filter with: @@ -211,12 +213,7 @@ jobs: - name: Run chainsaw health checks # The H100 stack can make namespace cleanup API calls slow under load. # Keep cleanup enabled, but allow more than the default 30s deadline. - run: | - chainsaw test \ - --test-dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --config tests/chainsaw/chainsaw-config.yaml \ - --cleanup-timeout 120s \ - --delete-timeout 120s + run: bash .github/scripts/gpu-chainsaw-health.sh tests/chainsaw/ai-conformance/kind-training-kubeflow # --- CNCF AI Conformance validation --- # Runs last to ensure the DCGM → Prometheus → adapter pipeline @@ -241,19 +238,7 @@ jobs: - name: Validate CNCF AI Conformance id: validate-conformance - run: | - AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ - ./aicr validate \ - --recipe recipe.yaml \ - --phase conformance \ - --namespace gpu-operator \ - --kubeconfig="${HOME}/.kube/config" \ - --require-gpu \ - --image=ko.local:smoke-test \ - --timeout=10m \ - --toleration '*' \ - --output=validation-result.yaml \ - --evidence-dir=conformance-evidence + run: bash .github/scripts/gpu-validate-conformance.sh # --- Validation artifacts --- @@ -274,81 +259,9 @@ jobs: if: failure() timeout-minutes: 5 shell: bash - run: | - set -o pipefail - kubectl_kind() { - timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" - } - print_workload_images() { - local ns="$1" - kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \ - | jq -r ' - .items[] | - [ - .kind, - .metadata.namespace + "/" + .metadata.name, - 
(([.spec.template.spec.containers[]?.image] + - [.spec.template.spec.initContainers[]?.image]) | unique | join(",")) - ] | @tsv - ' || true - } - - echo "=== Workload image inventory ===" - for NS in cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \ - nvidia-network-operator kai-scheduler kubeflow; do - echo "--- ${NS} ---" - print_workload_images "${NS}" - done - echo "=== Grafana deployment ===" - kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true - echo "=== Grafana pods ===" - kubectl_kind -n monitoring get pods \ - -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true - echo "=== Grafana deployment describe ===" - kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true - echo "=== Grafana pod describe ===" - kubectl_kind -n monitoring describe pods \ - -l app.kubernetes.io/name=grafana 2>/dev/null || true - echo "=== KAI scheduler pods ===" - kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true - echo "=== KAI admission deployment ===" - kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true - echo "=== KAI admission deployment describe ===" - kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true - echo "=== KAI admission pod describe ===" - kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \ - | grep '^pod/admission-' \ - | while read -r pod; do - kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true - done || true - echo "=== KAI admission logs ===" - kubectl_kind -n kai-scheduler \ - logs deployment/admission --all-containers --tail=200 2>/dev/null || true - echo "=== KAI scheduler logs ===" - kubectl_kind -n kai-scheduler \ - logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true - echo "=== KAI scheduler queues ===" - kubectl_kind get queues -A 2>/dev/null || true - echo "=== KAI scheduler podgroups ===" - kubectl_kind get podgroups -A 2>/dev/null || true - echo "=== Recent 
events (kai-scheduler) ===" - kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true - echo "=== Kubeflow Trainer deployment ===" - kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true - echo "=== Kubeflow pods ===" - kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true - echo "=== Kubeflow validating webhooks ===" - kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true - echo "=== Kubeflow Trainer CRD ===" - kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true - echo "=== Non-running pods (all namespaces) ===" - kubectl_kind get pods -A \ - --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true - echo "=== GPU Operator pods ===" - kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true - echo "=== Node resources ===" - kubectl_kind describe nodes 2>/dev/null | \ - grep -A 20 "Allocated resources" || true + env: + GPU_TEST_DIAGNOSTIC_MODE: training + run: bash .github/scripts/gpu-debug-diagnostics.sh - name: GPU Test Cleanup if: always() diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml index 805548afc..0cbb2da5c 100644 --- a/.github/workflows/gpu-smoke-test.yaml +++ b/.github/workflows/gpu-smoke-test.yaml @@ -38,6 +38,8 @@ jobs: should-run: ${{ steps.filter.outputs.matched }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 id: filter with: @@ -106,28 +108,7 @@ jobs: method: helm - name: Run nvidia-smi in a pod - run: | - cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - - apiVersion: v1 - kind: Pod - metadata: - name: gpu-smoke-test - spec: - restartPolicy: Never - containers: - - name: nvidia-smi - image: ubuntu:22.04 - command: 
["nvidia-smi"] - resources: - limits: - nvidia.com/gpu: 1 - EOF - - echo "Waiting for gpu-smoke-test pod to complete..." - kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \ - --for=condition=Ready --timeout=120s || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \ - --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s + run: bash .github/scripts/gpu-smoke-run-nvidia-smi.sh - name: Show nvidia-smi output run: kubectl --context="kind-${KIND_CLUSTER_NAME}" logs gpu-smoke-test @@ -143,17 +124,9 @@ jobs: - name: Debug diagnostics if: failure() - run: | - echo "=== ClusterPolicy status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true - echo "=== GPU Operator pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true - echo "=== Non-running pods (all namespaces) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true - echo "=== Recent events (gpu-operator) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - echo "=== Node status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true + env: + GPU_TEST_DIAGNOSTIC_MODE: smoke + run: bash .github/scripts/gpu-debug-diagnostics.sh - name: GPU Test Cleanup if: always() diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index ba62aef1a..02b104a8c 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -1322,8 +1322,8 @@ After `helm install`, the same manifests are re-applied as post-install to ensur Components that use operator patterns with custom resources that reconcile asynchronously use component-specific install behavior: -- `kai-scheduler` installs without `--wait` to avoid Helm timing out on CR readiness. 
It uses a 30 minute per-attempt timeout and caps the retry budget at 1 retry so hook diagnostics surface quickly on cold runners. -- `dynamo-platform` uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts. +- `kai-scheduler` installs without `--wait` to avoid Helm timing out on CR readiness. It uses a 30-minute per-attempt timeout and caps the retry budget at 1 retry so hook diagnostics surface quickly on cold runners. +- `dynamo-platform` uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20-minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts. - `kube-prometheus-stack` keeps the default timeout and retry budget because Grafana can hit its Deployment progress deadline before a longer Helm timeout would help, especially on slower H100 CI runners under image-pull and control-plane load. 
##### DRA kubelet plugin registration diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index ff41c88e4..4028a8246 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -575,12 +575,16 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]`, wantApplyArgs: `COMPONENT_HELM_APPLY_ARGS=(--server-side=false)`, wantSnippets: []string{ + `helm_supports_server_side_flag`, + `--request-timeout="${KUBECTL_REQUEST_TIMEOUT}"`, + `dynamo-platform conflict mitigation requires a Helm client that supports --server-side=false`, `dump_dynamo_platform_helm_diagnostics "${namespace}"`, `deployment/dynamo-platform-dynamo-operator-controller-manager`, `--previous --tail=200`, }, wantReadmeSnippets: []string{ `--server-side=false`, + `Helm client that supports the flag`, `--wait --timeout 20m`, }, }, diff --git a/pkg/bundler/deployer/helm/templates/README.md.tmpl b/pkg/bundler/deployer/helm/templates/README.md.tmpl index ba3cef380..3a1368470 100644 --- a/pkg/bundler/deployer/helm/templates/README.md.tmpl +++ b/pkg/bundler/deployer/helm/templates/README.md.tmpl @@ -104,6 +104,9 @@ helm upgrade --install {{ .Name }} {{ .ChartName }} \ {{ end -}} {{ end -}} ``` +{{ if eq .Name "dynamo-platform" }} +`--server-side=false` requires a Helm client that supports the flag (Helm v4 or compatible). `deploy.sh` checks support before adding the flag and warns before falling back when unavailable. 
+{{ end -}} {{ end -}} {{ if .HasManifests }} ```bash diff --git a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl index 7797779a0..84517401d 100644 --- a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl +++ b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl @@ -70,6 +70,9 @@ helm upgrade --install {{ .Name }} {{ .ChartName }} \ {{ end -}} {{ end -}} ``` +{{ if eq .Name "dynamo-platform" }} +`--server-side=false` requires a Helm client that supports the flag (Helm v4 or compatible). `deploy.sh` checks support before adding the flag and warns before falling back when unavailable. +{{ end -}} {{ if .HasManifests }} After the chart is installed, apply additional manifests: diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 826ab08ed..c8f7ceb68 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -24,6 +24,7 @@ trap 'rm -rf "${HELM_WORKDIR}"' EXIT cd "${HELM_WORKDIR}" HELM_TIMEOUT="10m" +KUBECTL_REQUEST_TIMEOUT="10s" NO_WAIT=false BEST_EFFORT=false FAILED_COMPONENTS="" @@ -59,6 +60,10 @@ function backoff_seconds() { echo "${seconds}" } +function helm_supports_server_side_flag() { + helm help upgrade 2>/dev/null | grep -q -- '--server-side' +} + function retry() { local desc="$1"; shift local attempt=0 @@ -152,23 +157,23 @@ function dump_dynamo_platform_helm_diagnostics() { echo " --- ${namespace} diagnostics ---" echo " Deployments:" - kubectl get deployments -n "${namespace}" -o wide 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get deployments -n "${namespace}" -o wide 2>/dev/null || true echo " Jobs:" - kubectl get jobs -n "${namespace}" 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" 2>/dev/null || true echo " Pods:" - kubectl get pods -n 
"${namespace}" -o wide 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -o wide 2>/dev/null || true echo " Pod descriptions:" - kubectl describe pods -n "${namespace}" 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}" 2>/dev/null || true echo " Dynamo operator manager logs:" - kubectl logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --tail=200 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --tail=200 2>/dev/null || true echo " Dynamo operator manager previous logs:" - kubectl logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --previous --tail=200 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --previous --tail=200 2>/dev/null || true echo " Grove operator logs:" - kubectl logs deployment/grove-operator -n "${namespace}" --all-containers --tail=100 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/grove-operator -n "${namespace}" --all-containers --tail=100 2>/dev/null || true echo " Grove operator previous logs:" - kubectl logs deployment/grove-operator -n "${namespace}" --all-containers --previous --tail=100 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/grove-operator -n "${namespace}" --all-containers --previous --tail=100 2>/dev/null || true echo " Recent events:" - kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true echo " --- End 
${namespace} diagnostics ---" } @@ -414,8 +419,12 @@ fi COMPONENT_HELM_TIMEOUT="20m" # Grove owns the generated webhook certificate Secret data after install. # Client-side apply avoids server-side field ownership conflicts during retries. -# Requires Helm v4+ for --server-side=false; AICR bundles pin Helm v4 in .settings.yaml. -COMPONENT_HELM_APPLY_ARGS=(--server-side=false) +# This flag requires a Helm client that supports --server-side=false. +if helm_supports_server_side_flag; then + COMPONENT_HELM_APPLY_ARGS=(--server-side=false) +else + echo "::warning::dynamo-platform conflict mitigation requires a Helm client that supports --server-side=false; proceeding without this flag" +fi if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then COMPONENT_MAX_RETRIES="3" fi From 1d553296663225f40a2edb21bd228eaadeeab427 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sun, 26 Apr 2026 15:39:11 -0700 Subject: [PATCH 19/21] ci: address GPU workflow review hardening --- .github/actions/README.md | 7 +++ .github/actions/aicr-build/build-cli.sh | 14 +++++ .../aicr-build/build-snapshot-agent.sh | 16 +++++- .../aicr-build/build-validator-images.sh | 23 +++++++- .github/actions/aicr-build/stage-cli.sh | 14 +++++ .../check-runner-capacity.sh | 10 ++-- .../configure-nvidia-container-toolkit.sh | 15 ++++- .../create-gpu-kind-cluster.sh | 56 ++++++++++++++++++- .../delete-stale-kind-cluster.sh | 18 +++--- .../gpu-cluster-setup/install-nvkind.sh | 8 ++- .../gpu-cluster-setup/warm-kind-node-image.sh | 10 ++-- .../gpu-operator-install/generate-bundle.sh | 1 + .../install-gpu-operator-helm.sh | 5 +- .../wait-gpu-operands-bundle.sh | 6 +- .../wait-gpu-operands-helm.sh | 19 ++++++- .../validate-snapshot-gpu.sh | 4 ++ .../collect-debug-artifacts.sh | 4 +- .../install-karpenter-kwok.sh | 8 ++- .github/scripts/gpu-chainsaw-health.sh | 32 ++++++++++- .github/scripts/gpu-debug-diagnostics.sh | 24 ++++++-- .github/scripts/gpu-smoke-run-nvidia-smi.sh | 29 ++++++++-- 
.../scripts/gpu-smoke-show-nvidia-smi-log.sh | 35 ++++++++++++ .github/scripts/gpu-validate-conformance.sh | 14 +++++ .github/workflows/gpu-smoke-test.yaml | 2 +- docs/user/cli-reference.md | 2 +- pkg/bundler/deployer/helm/helm_test.go | 11 ++-- .../deployer/helm/templates/README.md.tmpl | 2 +- .../helm/templates/component-README.md.tmpl | 2 +- .../deployer/helm/templates/deploy.sh.tmpl | 56 ++++++++++++++----- 29 files changed, 381 insertions(+), 66 deletions(-) create mode 100644 .github/scripts/gpu-smoke-show-nvidia-smi-log.sh diff --git a/.github/actions/README.md b/.github/actions/README.md index 3b1ee648d..15710df7d 100644 --- a/.github/actions/README.md +++ b/.github/actions/README.md @@ -4,6 +4,13 @@ This directory contains a modular, reusable GitHub Actions architecture optimize ## Composite Actions +### Script Conventions + +Composite action helper scripts in this directory are intentionally portable +across checkout modes: keep them mode `0644` and invoke them as +`bash path/to/script.sh` from workflows or `action.yml` files. Do not rely on +executable bits or `./script.sh` invocation. + ### Core CI/CD Actions #### `security-scan/` diff --git a/.github/actions/aicr-build/build-cli.sh b/.github/actions/aicr-build/build-cli.sh index 81b657f58..83c834aad 100644 --- a/.github/actions/aicr-build/build-cli.sh +++ b/.github/actions/aicr-build/build-cli.sh @@ -1,4 +1,18 @@ #!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + set -euo pipefail mkdir -p dist diff --git a/.github/actions/aicr-build/build-snapshot-agent.sh b/.github/actions/aicr-build/build-snapshot-agent.sh index 9650fbd0d..512aad2f0 100644 --- a/.github/actions/aicr-build/build-snapshot-agent.sh +++ b/.github/actions/aicr-build/build-snapshot-agent.sh @@ -1,9 +1,23 @@ #!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -euo pipefail # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection). # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) because only nvidia-smi is needed. -docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE' +timeout 900s docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE' FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04 COPY dist/aicr /usr/local/bin/aicr ENTRYPOINT ["/usr/local/bin/aicr"] diff --git a/.github/actions/aicr-build/build-validator-images.sh b/.github/actions/aicr-build/build-validator-images.sh index 4389b15b4..76682af90 100644 --- a/.github/actions/aicr-build/build-validator-images.sh +++ b/.github/actions/aicr-build/build-validator-images.sh @@ -1,4 +1,18 @@ #!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -euo pipefail if [[ -n "${VALIDATOR_PHASES}" ]]; then @@ -14,12 +28,19 @@ fi mkdir -p dist/validator for phase in ${PHASES//,/ }; do + if ! [[ "${phase}" =~ ^[a-z][a-z0-9_-]*$ ]]; then + echo "::error::invalid validator phase '${phase}'; expected ^[a-z][a-z0-9_-]*$" + exit 1 + fi echo "Building validator binary: ${phase}" CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}" done for phase in ${PHASES//,/ }; do - mkdir -p "validators/${phase}/testdata" + if [[ ! -d "validators/${phase}/testdata" ]]; then + echo "::warning::validators/${phase}/testdata is missing; creating empty testdata directory" + mkdir -p "validators/${phase}/testdata" + fi docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . </dev/null 2>&1; then + echo "Docker is healthy after NVIDIA runtime configuration." + exit 0 + fi + echo "Waiting for Docker to become healthy... 
(${attempt}/30)" + sleep 2 +done + +echo "::error::Docker did not become healthy after NVIDIA runtime configuration" +sudo systemctl status docker --no-pager || true +exit 1 diff --git a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh index 42a282e17..2f9ab4817 100644 --- a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh +++ b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh @@ -96,6 +96,11 @@ esac if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then patch_dir="$(mktemp -d)" config_template="$(mktemp)" + cleanup_generated_config() { + [[ -n "${patch_dir:-}" ]] && rm -rf "${patch_dir}" + [[ -n "${config_template:-}" ]] && rm -f "${config_template}" + } + trap cleanup_generated_config EXIT # Keep YAML heredocs at column 0; indentation is literal content. if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then @@ -361,6 +366,48 @@ static_pod_manifest_contains_arg() { docker exec "${node}" grep -Fq -- "- ${expected}" "/etc/kubernetes/manifests/${component}.yaml" } +running_static_pod_container_contains_arg() { + local component="$1" + local expected="$2" + local node="${KIND_CLUSTER_NAME}-control-plane" + local container_ids + local container_id + + if ! container_ids="$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null)"; then + return 1 + fi + [[ -z "${container_ids}" ]] && return 1 + + for container_id in ${container_ids}; do + if docker exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -e --arg expected "${expected}" ' + ([.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?] 
| index($expected)) != null + ' >/dev/null; then + return 0 + fi + done + return 1 +} + +dump_running_static_pod_container_args() { + local component="$1" + local node="${KIND_CLUSTER_NAME}-control-plane" + local container_ids + local container_id + + echo "Running ${component} CRI container args:" + container_ids="$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)" + if [[ -z "${container_ids}" ]]; then + echo "(no running ${component} CRI containers found)" + return + fi + for container_id in ${container_ids}; do + echo "--- ${container_id} ---" + docker exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -r ' + [.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?][]? + ' || true + done +} + dump_static_pod_manifest() { local component="$1" local node="${KIND_CLUSTER_NAME}-control-plane" @@ -375,14 +422,19 @@ assert_control_plane_arg() { local command_args command_args="$(control_plane_command_args "${component}")" - if ! grep -Fxq "${expected}" <<< "${command_args}"; then + if ! 
grep -Fxq -- "${expected}" <<< "${command_args}"; then + if running_static_pod_container_contains_arg "${component}" "${expected}"; then + echo "${component} running CRI container args verified: ${expected} (live mirror pod omitted it)" + return + fi if static_pod_manifest_contains_arg "${component}" "${expected}"; then - echo "::warning::${component} live mirror pod command/args did not show ${expected}; static pod manifest is patched" + echo "::warning::${component} live mirror pod and running CRI container args did not show ${expected}; static pod manifest is patched" return fi echo "::error::${component} live pod command/args does not contain ${expected}" echo "Observed live command/args:" echo "${command_args}" + dump_running_static_pod_container_args "${component}" dump_static_pod_manifest "${component}" exit 1 fi diff --git a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh index 8e85ffcb9..0f29c469b 100644 --- a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh +++ b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh @@ -15,6 +15,10 @@ set -euo pipefail kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" +docker_timeout() { + timeout 30s docker "$@" +} + if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}" if ! 
timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then @@ -24,16 +28,16 @@ else echo "No stale kind cluster named ${KIND_CLUSTER_NAME}" fi -remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}") -if [[ -n "${remaining_containers}" ]]; then +mapfile -t remaining_containers < <(docker_timeout ps -aq --filter "label=${kind_cluster_label}") +if (( ${#remaining_containers[@]} > 0 )); then echo "Removing stale containers for ${KIND_CLUSTER_NAME}:" - docker ps -a --filter "label=${kind_cluster_label}" - docker rm -f ${remaining_containers} + docker_timeout ps -a --filter "label=${kind_cluster_label}" + docker_timeout rm -f "${remaining_containers[@]}" fi -remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}") -if [[ -n "${remaining_containers}" ]]; then +mapfile -t remaining_containers < <(docker_timeout ps -aq --filter "label=${kind_cluster_label}") +if (( ${#remaining_containers[@]} > 0 )); then echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:" - docker ps -a --filter "label=${kind_cluster_label}" + docker_timeout ps -a --filter "label=${kind_cluster_label}" exit 1 fi diff --git a/.github/actions/gpu-cluster-setup/install-nvkind.sh b/.github/actions/gpu-cluster-setup/install-nvkind.sh index 38f1ce0ae..c2200e078 100644 --- a/.github/actions/gpu-cluster-setup/install-nvkind.sh +++ b/.github/actions/gpu-cluster-setup/install-nvkind.sh @@ -15,5 +15,11 @@ set -euo pipefail +if [[ -z "${NVKIND_VERSION:-}" ]]; then + echo "::error::NVKIND_VERSION must be set" + exit 1 +fi + go install "github.com/NVIDIA/nvkind/cmd/nvkind@${NVKIND_VERSION}" -nvkind --help +nvkind_bin="${GOBIN:-$(go env GOPATH)/bin}/nvkind" +"${nvkind_bin}" --help diff --git a/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh index 3d58a4887..b0567fa7c 100644 --- a/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh +++ 
b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh @@ -21,9 +21,11 @@ else echo "Pulling kind node image: ${KIND_NODE_IMAGE}" timeout 600s docker pull "${KIND_NODE_IMAGE}" fi -free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9') -if (( free_disk_gb < MIN_FREE_DISK_GB )); then - echo "::error::free disk on / is ${free_disk_gb}GiB after warming ${KIND_NODE_IMAGE}, need at least ${MIN_FREE_DISK_GB}GiB" +free_disk_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9') +min_free_disk_bytes=$((MIN_FREE_DISK_GB * 1024 * 1024 * 1024)) +free_disk_gib=$((free_disk_bytes / 1024 / 1024 / 1024)) +if (( free_disk_bytes < min_free_disk_bytes )); then + echo "::error::free disk on / is ${free_disk_bytes} bytes (${free_disk_gib}GiB) after warming ${KIND_NODE_IMAGE}, need at least ${min_free_disk_bytes} bytes (${MIN_FREE_DISK_GB}GiB)" exit 1 fi -echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gb}GiB" +echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gib}GiB (${free_disk_bytes} bytes)" diff --git a/.github/actions/gpu-operator-install/generate-bundle.sh b/.github/actions/gpu-operator-install/generate-bundle.sh index c7dd3f413..095b68415 100644 --- a/.github/actions/gpu-operator-install/generate-bundle.sh +++ b/.github/actions/gpu-operator-install/generate-bundle.sh @@ -15,6 +15,7 @@ set -euo pipefail +rm -rf bundle ./aicr bundle \ --recipe recipe.yaml \ --accelerated-node-toleration nvidia.com/gpu:NoSchedule \ diff --git a/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh index 0aea450eb..6079cad83 100644 --- a/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh +++ b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh @@ -15,7 +15,9 @@ set -euo pipefail -helm repo add nvidia https://helm.ngc.nvidia.com/nvidia +GPU_OPERATOR_CHART_VERSION="v25.10.1" + +helm repo add nvidia 
https://helm.ngc.nvidia.com/nvidia --force-update helm repo update helm upgrade -i \ --kube-context="kind-${KIND_CLUSTER_NAME}" \ @@ -25,5 +27,6 @@ helm upgrade -i \ --set toolkit.enabled=false \ --set dcgmExporter.enabled=false \ --set nfd.enabled=true \ + --version="${GPU_OPERATOR_CHART_VERSION}" \ --wait --timeout=600s \ gpu-operator nvidia/gpu-operator diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh index 496eb372e..2fee8a2c0 100644 --- a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh +++ b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh @@ -20,7 +20,7 @@ echo "Waiting for GPU operator controller to deploy operands..." # DaemonSets for device-plugin, NFD, GFD, etc. This happens # asynchronously after the helm install completes. for i in $(seq 1 30); do - count=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ + count=$(kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l) if [[ "$count" -gt 0 ]]; then echo "Device plugin DaemonSet found." @@ -32,7 +32,7 @@ done echo "Waiting for device plugin rollout..." # Operands are excluded from control-plane nodes via nodeAffinity in # the kind overlay, so all scheduled pods should become ready. 
-kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ +kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s echo "GPU Operator pods:" -kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods +kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh index 2ad7e801d..2f0bbe159 100644 --- a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh +++ b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh @@ -16,7 +16,22 @@ set -euo pipefail echo "Waiting for device plugin to be ready..." -kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ +for i in $(seq 1 30); do + if kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ + get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | grep -q .; then + echo "Device plugin DaemonSet found." + break + fi + if (( i == 30 )); then + echo "::error::device plugin DaemonSet was not created within 300s" + kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods || true + exit 1 + fi + echo "Waiting for device plugin DaemonSet to be created... 
(${i}/30)" + sleep 10 +done + +kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true echo "GPU Operator pods:" -kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods +kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods diff --git a/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh index 57a622a2d..5a27e6093 100644 --- a/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh +++ b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh @@ -20,6 +20,10 @@ GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | se GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml) echo "GPU model: ${GPU_MODEL}" echo "GPU count: ${GPU_COUNT}" +if ! [[ "${GPU_COUNT}" =~ ^[0-9]+$ ]]; then + echo "::error::Expected numeric gpu-count in snapshot, got: ${GPU_COUNT}" + exit 1 +fi if [[ "${GPU_MODEL}" != *"${EXPECTED_GPU_MODEL}"* ]]; then echo "::error::Expected ${EXPECTED_GPU_MODEL} GPU in snapshot, got: ${GPU_MODEL}" exit 1 diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh index 417606c7d..257695334 100644 --- a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh +++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh @@ -13,7 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -set -o pipefail +# Diagnostic artifact collection intentionally omits -e so one broken cluster +# call does not prevent later artifacts from being collected. 
+set -uo pipefail mkdir -p /tmp/debug-artifacts CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd" kubectl_kind() { diff --git a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh index 2fdb26312..472eb844b 100644 --- a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh +++ b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh @@ -32,10 +32,16 @@ validate_seconds_input() { echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'" exit 1 fi + if (( input_value <= 0 )); then + echo "::error::${input_name} must be greater than 0 seconds, got '${input_value}'" + exit 1 + fi } validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT}" validate_seconds_input ko_build_timeout "${KO_BUILD_TIMEOUT}" validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT}" bash kwok/scripts/install-karpenter-kwok.sh -kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml +timeout 30s kubectl --request-timeout=10s \ + --context="kind-${KIND_CLUSTER_NAME}" \ + apply -f kwok/manifests/karpenter/nodepool.yaml diff --git a/.github/scripts/gpu-chainsaw-health.sh b/.github/scripts/gpu-chainsaw-health.sh index 35450eb00..7e2c5ac96 100644 --- a/.github/scripts/gpu-chainsaw-health.sh +++ b/.github/scripts/gpu-chainsaw-health.sh @@ -1,10 +1,36 @@ #!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + set -euo pipefail +if [[ $# -ne 1 ]]; then + echo "::error::Usage: $0 " + exit 2 +fi test_dir="$1" +if [[ ! -d "${test_dir}" ]]; then + echo "::error::Test directory not found: ${test_dir}" + exit 1 +fi + +CHAINSAW_TEST_TIMEOUT="${CHAINSAW_TEST_TIMEOUT:-30m}" +CHAINSAW_CLEANUP_TIMEOUT="${CHAINSAW_CLEANUP_TIMEOUT:-120s}" +CHAINSAW_DELETE_TIMEOUT="${CHAINSAW_DELETE_TIMEOUT:-120s}" -chainsaw test \ +timeout "${CHAINSAW_TEST_TIMEOUT}" chainsaw test \ --test-dir "${test_dir}" \ --config tests/chainsaw/chainsaw-config.yaml \ - --cleanup-timeout 120s \ - --delete-timeout 120s + --cleanup-timeout "${CHAINSAW_CLEANUP_TIMEOUT}" \ + --delete-timeout "${CHAINSAW_DELETE_TIMEOUT}" diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh index 0d1cd76a2..3721691b3 100644 --- a/.github/scripts/gpu-debug-diagnostics.sh +++ b/.github/scripts/gpu-debug-diagnostics.sh @@ -1,5 +1,22 @@ #!/usr/bin/env bash -set -o pipefail +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diagnostic script: intentionally omits -e so each mode can keep collecting +# partial failure data. Keep -u and pipefail to catch script bugs and pipeline +# failures while individual kubectl_kind calls tolerate cluster errors. 
+set -uo pipefail mode="${GPU_TEST_DIAGNOSTIC_MODE:-smoke}" @@ -86,6 +103,7 @@ case "${mode}" in training) print_workload_inventory cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \ nvidia-network-operator kai-scheduler kubeflow + print_common_gpu_diagnostics print_grafana_diagnostics print_kai_diagnostics echo "=== Kubeflow Trainer deployment ===" @@ -96,10 +114,6 @@ case "${mode}" in kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true echo "=== Kubeflow Trainer CRD ===" kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true - echo "=== Non-running pods (all namespaces) ===" - kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true - echo "=== GPU Operator pods ===" - kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true echo "=== Node resources ===" kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true ;; diff --git a/.github/scripts/gpu-smoke-run-nvidia-smi.sh b/.github/scripts/gpu-smoke-run-nvidia-smi.sh index cefff49fa..1a2151253 100644 --- a/.github/scripts/gpu-smoke-run-nvidia-smi.sh +++ b/.github/scripts/gpu-smoke-run-nvidia-smi.sh @@ -1,11 +1,27 @@ #!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ set -euo pipefail -cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - +pod_name=$(cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" create -f - -o jsonpath='{.metadata.name}' apiVersion: v1 kind: Pod metadata: - name: gpu-smoke-test + generateName: gpu-smoke-test- + labels: + app: gpu-smoke-test spec: restartPolicy: Never containers: @@ -16,9 +32,12 @@ spec: limits: nvidia.com/gpu: 1 EOF +) + +echo "${pod_name}" > /tmp/aicr-gpu-smoke-pod-name -echo "Waiting for gpu-smoke-test pod to complete..." -kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \ +echo "Waiting for ${pod_name} pod to complete..." +kubectl --context="kind-${KIND_CLUSTER_NAME}" wait "pod/${pod_name}" \ --for=condition=Ready --timeout=120s || true -kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \ +kubectl --context="kind-${KIND_CLUSTER_NAME}" wait "pod/${pod_name}" \ --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s diff --git a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh new file mode 100644 index 000000000..982648460 --- /dev/null +++ b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +pod_name="" +if [[ -f /tmp/aicr-gpu-smoke-pod-name ]]; then + pod_name="$(cat /tmp/aicr-gpu-smoke-pod-name)" +fi + +if [[ -z "${pod_name}" ]]; then + pod_name=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods \ + -l app=gpu-smoke-test \ + --sort-by=.metadata.creationTimestamp \ + -o jsonpath='{.items[-1:].metadata.name}') +fi + +if [[ -z "${pod_name}" ]]; then + echo "::error::no gpu-smoke-test pod found" + exit 1 +fi + +kubectl --context="kind-${KIND_CLUSTER_NAME}" logs "${pod_name}" diff --git a/.github/scripts/gpu-validate-conformance.sh b/.github/scripts/gpu-validate-conformance.sh index 2fb3fa4a3..79550cb3a 100644 --- a/.github/scripts/gpu-validate-conformance.sh +++ b/.github/scripts/gpu-validate-conformance.sh @@ -1,4 +1,18 @@ #!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ set -euo pipefail AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml index 0cbb2da5c..bdf607e07 100644 --- a/.github/workflows/gpu-smoke-test.yaml +++ b/.github/workflows/gpu-smoke-test.yaml @@ -111,7 +111,7 @@ jobs: run: bash .github/scripts/gpu-smoke-run-nvidia-smi.sh - name: Show nvidia-smi output - run: kubectl --context="kind-${KIND_CLUSTER_NAME}" logs gpu-smoke-test + run: bash .github/scripts/gpu-smoke-show-nvidia-smi-log.sh # --- Snapshot and validation --- diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index 02b104a8c..c641eee07 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -1323,7 +1323,7 @@ After `helm install`, the same manifests are re-applied as post-install to ensur Components that use operator patterns with custom resources that reconcile asynchronously use component-specific install behavior: - `kai-scheduler` installs without `--wait` to avoid Helm timing out on CR readiness. It uses a 30-minute per-attempt timeout and caps the retry budget at 1 retry so hook diagnostics surface quickly on cold runners. -- `dynamo-platform` uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20-minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts. +- `dynamo-platform` has `deploy.sh` attempt `--server-side=false` so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. The script only adds `--server-side=false` when Helm v4.0.5 or later is detected; with older Helm clients it logs a warning and proceeds without that mitigation. 
Dynamo installs use a 20-minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts. - `kube-prometheus-stack` keeps the default timeout and retry budget because Grafana can hit its Deployment progress deadline before a longer Helm timeout would help, especially on slower H100 CI runners under image-pull and control-plane load. ##### DRA kubelet plugin registration diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 4028a8246..da23545a0 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -512,10 +512,10 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) { if !strings.Contains(script, `dump_kai_scheduler_helm_diagnostics "${namespace}"`) { t.Error("deploy.sh missing kai-scheduler diagnostics hook") } - if !strings.Contains(script, `kubectl get jobs -n "${namespace}"`) { + if !strings.Contains(script, `kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}"`) { t.Error("deploy.sh missing job diagnostics") } - if !strings.Contains(script, `kubectl describe pods -n "${namespace}"`) { + if !strings.Contains(script, `kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}"`) { t.Error("deploy.sh missing pod diagnostics") } @@ -575,16 +575,17 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]`, wantApplyArgs: `COMPONENT_HELM_APPLY_ARGS=(--server-side=false)`, wantSnippets: []string{ - `helm_supports_server_side_flag`, + `helm_supports_server_side_false_install`, + `Require v4.0.5+ before relying on`, `--request-timeout="${KUBECTL_REQUEST_TIMEOUT}"`, - `dynamo-platform conflict mitigation requires a Helm client that supports --server-side=false`, + `dynamo-platform conflict mitigation requires Helm v4.0.5+`, 
`dump_dynamo_platform_helm_diagnostics "${namespace}"`, `deployment/dynamo-platform-dynamo-operator-controller-manager`, `--previous --tail=200`, }, wantReadmeSnippets: []string{ `--server-side=false`, - `Helm client that supports the flag`, + `requires Helm v4.0.5 or later`, `--wait --timeout 20m`, }, }, diff --git a/pkg/bundler/deployer/helm/templates/README.md.tmpl b/pkg/bundler/deployer/helm/templates/README.md.tmpl index 3a1368470..1ef2f252b 100644 --- a/pkg/bundler/deployer/helm/templates/README.md.tmpl +++ b/pkg/bundler/deployer/helm/templates/README.md.tmpl @@ -105,7 +105,7 @@ helm upgrade --install {{ .Name }} {{ .ChartName }} \ {{ end -}} ``` {{ if eq .Name "dynamo-platform" }} -`--server-side=false` requires a Helm client that supports the flag (Helm v4 or compatible). `deploy.sh` checks support before adding the flag and warns before falling back when unavailable. +`--server-side=false` requires Helm v4.0.5 or later. `deploy.sh` checks the Helm version before adding the flag and warns before falling back when unavailable. {{ end -}} {{ end -}} {{ if .HasManifests }} diff --git a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl index 84517401d..66762ac7f 100644 --- a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl +++ b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl @@ -71,7 +71,7 @@ helm upgrade --install {{ .Name }} {{ .ChartName }} \ {{ end -}} ``` {{ if eq .Name "dynamo-platform" }} -`--server-side=false` requires a Helm client that supports the flag (Helm v4 or compatible). `deploy.sh` checks support before adding the flag and warns before falling back when unavailable. +`--server-side=false` requires Helm v4.0.5 or later. `deploy.sh` checks the Helm version before adding the flag and warns before falling back when unavailable. 
{{ end -}} {{ if .HasManifests }} After the chart is installed, apply additional manifests: diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index c8f7ceb68..4c1e26f28 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -60,7 +60,32 @@ function backoff_seconds() { echo "${seconds}" } -function helm_supports_server_side_flag() { +function helm_supports_server_side_false_install() { + local version + local major + local minor + local patch + + # Helm v4.0.0-v4.0.4 advertise --server-side=false but ignore it for the + # upgrade --install install-fallback path. Require v4.0.5+ before relying on + # the flag for Dynamo's webhook Secret conflict mitigation. + version="$(helm version --short 2>/dev/null | head -n 1 || true)" + version="${version#v}" + version="${version%%+*}" + version="${version%%-*}" + if ! [[ "${version}" =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then + return 1 + fi + major="${BASH_REMATCH[1]}" + minor="${BASH_REMATCH[2]}" + patch="${BASH_REMATCH[3]}" + if (( major < 4 )); then + return 1 + fi + if (( major == 4 )) && (( minor == 0 )) && (( patch < 5 )); then + return 1 + fi + helm help upgrade 2>/dev/null | grep -q -- '--server-side' } @@ -91,7 +116,7 @@ function retry() { function cleanup_helm_hooks() { local namespace="$1" local job_names - job_names=$(kubectl get jobs -n "${namespace}" \ + job_names=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" \ --field-selector=status.successful=0 \ -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \ 2>/dev/null || true) @@ -102,7 +127,7 @@ function cleanup_helm_hooks() { [[ -z "${name}" ]] && continue # Get the full Job JSON to reliably check annotations and status local job_json - job_json=$(kubectl get job "${name}" -n "${namespace}" -o json 2>/dev/null || true) + job_json=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" 
get job "${name}" -n "${namespace}" -o json 2>/dev/null || true) [[ -z "${job_json}" ]] && continue # Skip non-hook Jobs (no helm.sh/hook annotation) local hook_val @@ -111,13 +136,13 @@ function cleanup_helm_hooks() { # Capture diagnostics before deleting. This helps diagnose transient hook # failures (e.g., dynamo ssh-keygen) that are otherwise lost after cleanup. echo " --- Failed hook Job ${name} diagnostics ---" - kubectl describe job "${name}" -n "${namespace}" 2>/dev/null | tail -50 || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe job "${name}" -n "${namespace}" 2>/dev/null | tail -50 || true local pod_names - pod_names=$(kubectl get pods -n "${namespace}" -l "job-name=${name}" \ + pod_names=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -l "job-name=${name}" \ -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true) for pod_name in ${pod_names}; do echo " --- Hook pod ${pod_name} describe ---" - kubectl describe pod "${pod_name}" -n "${namespace}" 2>/dev/null | tail -50 || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pod "${pod_name}" -n "${namespace}" 2>/dev/null | tail -50 || true done echo " --- End diagnostics for ${name} ---" # Delete any non-succeeded hook Job. This function only runs after a Helm @@ -125,7 +150,7 @@ function cleanup_helm_hooks() { # retry — whether it failed, is stuck Pending (timed out before the pod # started), or is still active with a stuck container. echo " Cleaning up stale Helm hook Job ${name} in ${namespace}..." 
- kubectl delete job "${name}" -n "${namespace}" --ignore-not-found 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" delete job "${name}" -n "${namespace}" --ignore-not-found 2>/dev/null || true done <<< "${job_names}" } @@ -137,15 +162,15 @@ function dump_kai_scheduler_helm_diagnostics() { echo " --- ${namespace} diagnostics ---" echo " Jobs:" - kubectl get jobs -n "${namespace}" 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" 2>/dev/null || true echo " Job descriptions:" - kubectl describe jobs -n "${namespace}" 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe jobs -n "${namespace}" 2>/dev/null || true echo " Pods:" - kubectl get pods -n "${namespace}" -o wide 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -o wide 2>/dev/null || true echo " Pod descriptions:" - kubectl describe pods -n "${namespace}" 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}" 2>/dev/null || true echo " Recent events:" - kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true echo " --- End ${namespace} diagnostics ---" } @@ -419,11 +444,12 @@ fi COMPONENT_HELM_TIMEOUT="20m" # Grove owns the generated webhook certificate Secret data after install. # Client-side apply avoids server-side field ownership conflicts during retries. -# This flag requires a Helm client that supports --server-side=false. -if helm_supports_server_side_flag; then +# This flag requires Helm v4.0.5+; earlier Helm v4 releases advertise the flag +# but ignore --server-side=false on a fresh upgrade --install fallback. 
+if helm_supports_server_side_false_install; then COMPONENT_HELM_APPLY_ARGS=(--server-side=false) else - echo "::warning::dynamo-platform conflict mitigation requires a Helm client that supports --server-side=false; proceeding without this flag" + echo "::warning::dynamo-platform conflict mitigation requires Helm v4.0.5+ with working --server-side=false install fallback; proceeding without this flag" fi if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then COMPONENT_MAX_RETRIES="3" From e1e931aba92ff0178f3df2effc1b50e19f8516a0 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sun, 26 Apr 2026 19:48:05 -0700 Subject: [PATCH 20/21] ci: harden H100 kind runtime workflows --- .github/actions/aicr-build/build-cli.sh | 5 - .../aicr-build/build-validator-images.sh | 3 + .../check-control-plane-health/action.yml | 21 ++ .../check-control-plane-health.sh | 175 +++++++++++++- .../configure-nvidia-container-toolkit.sh | 11 +- .../create-gpu-kind-cluster.sh | 80 ++++--- .../delete-stale-kind-cluster.sh | 19 +- .../actions/gpu-debug-diagnostics/action.yml | 35 +++ .../wait-gpu-operands-bundle.sh | 18 +- .../wait-gpu-operands-helm.sh | 2 +- .../actions/gpu-smoke-nvidia-smi/action.yml | 36 +++ .github/actions/gpu-test-cleanup/action.yml | 10 +- .../collect-debug-artifacts.sh | 39 ++++ .../install-karpenter-kwok.sh | 2 +- .github/scripts/gpu-chainsaw-health.sh | 50 +++- .github/scripts/gpu-debug-diagnostics.sh | 212 ++++++++++++----- .../scripts/gpu-runtime-component-health.sh | 110 +++++++++ .github/scripts/gpu-smoke-run-nvidia-smi.sh | 20 +- .../scripts/gpu-smoke-show-nvidia-smi-log.sh | 16 +- .../workflows/gpu-h100-inference-test.yaml | 203 ++-------------- .../workflows/gpu-h100-kind-runtime-test.yaml | 221 ++++++++++++++++++ .github/workflows/gpu-h100-training-test.yaml | 197 ++-------------- .github/workflows/gpu-smoke-test.yaml | 26 ++- pkg/bundler/deployer/helm/helm_test.go | 4 + .../deployer/helm/templates/deploy.sh.tmpl | 3 +- recipes/overlays/kind.yaml | 33 +++ 26 files changed, 
1060 insertions(+), 491 deletions(-) create mode 100644 .github/actions/gpu-debug-diagnostics/action.yml create mode 100644 .github/actions/gpu-smoke-nvidia-smi/action.yml create mode 100644 .github/scripts/gpu-runtime-component-health.sh create mode 100644 .github/workflows/gpu-h100-kind-runtime-test.yaml diff --git a/.github/actions/aicr-build/build-cli.sh b/.github/actions/aicr-build/build-cli.sh index 83c834aad..c87428241 100644 --- a/.github/actions/aicr-build/build-cli.sh +++ b/.github/actions/aicr-build/build-cli.sh @@ -16,9 +16,4 @@ set -euo pipefail mkdir -p dist -if [[ -x dist/aicr ]]; then - echo "Reusing existing dist/aicr" - exit 0 -fi - CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr diff --git a/.github/actions/aicr-build/build-validator-images.sh b/.github/actions/aicr-build/build-validator-images.sh index 76682af90..f098e84e8 100644 --- a/.github/actions/aicr-build/build-validator-images.sh +++ b/.github/actions/aicr-build/build-validator-images.sh @@ -15,6 +15,7 @@ set -euo pipefail +VALIDATOR_PHASES="${VALIDATOR_PHASES:-}" if [[ -n "${VALIDATOR_PHASES}" ]]; then if [[ "${VALIDATOR_PHASES}" == "none" ]]; then echo "Skipping validator builds (validator_phases=none)" @@ -26,6 +27,8 @@ else PHASES="deployment,performance,conformance" fi +: "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}" + mkdir -p dist/validator for phase in ${PHASES//,/ }; do if ! 
[[ "${phase}" =~ ^[a-z][a-z0-9_-]*$ ]]; then diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml index 0172b00a9..85833925f 100644 --- a/.github/actions/check-control-plane-health/action.yml +++ b/.github/actions/check-control-plane-health/action.yml @@ -39,6 +39,22 @@ inputs: description: 'Optional duration to watch for new control-plane restarts after pods are Ready' required: false default: '0s' + stability_probe_interval: + description: 'Interval for active API server probes during the stability window' + required: false + default: '10s' + stability_probe_failure_threshold: + description: 'Consecutive active stability probe failures allowed before failing' + required: false + default: '2' + lease_components: + description: 'Space-separated leader election lease names to check for freshness' + required: false + default: kube-controller-manager kube-scheduler + lease_stale_timeout: + description: 'Maximum allowed leader election lease age at the end of a stability window' + required: false + default: '120s' recover_unhealthy: description: 'Restart eligible Kind control-plane static pod containers when they are currently unhealthy' required: false @@ -62,7 +78,12 @@ runs: NAMESPACE: ${{ inputs.namespace }} COMPONENTS: ${{ inputs.components }} WAIT_TIMEOUT: ${{ inputs.wait_timeout }} + MAX_RESTARTS: ${{ inputs.max_restarts }} STABILITY_WINDOW: ${{ inputs.stability_window }} + STABILITY_PROBE_INTERVAL: ${{ inputs.stability_probe_interval }} + STABILITY_PROBE_FAILURE_THRESHOLD: ${{ inputs.stability_probe_failure_threshold }} + LEASE_COMPONENTS: ${{ inputs.lease_components }} + LEASE_STALE_TIMEOUT: ${{ inputs.lease_stale_timeout }} RECOVER_UNHEALTHY: ${{ inputs.recover_unhealthy }} RECOVERY_COMPONENTS: ${{ inputs.recovery_components }} MAX_RECOVERY_ATTEMPTS: ${{ inputs.max_recovery_attempts }} diff --git a/.github/actions/check-control-plane-health/check-control-plane-health.sh 
b/.github/actions/check-control-plane-health/check-control-plane-health.sh index 3614df47f..350538255 100644 --- a/.github/actions/check-control-plane-health/check-control-plane-health.sh +++ b/.github/actions/check-control-plane-health/check-control-plane-health.sh @@ -25,6 +25,25 @@ validate_duration_input() { fi } +duration_seconds() { + local input_value="$1" + local number="${input_value%[smh]}" + local unit="${input_value: -1}" + local amount + + amount=$((10#${number})) + + case "${unit}" in + s) echo "${amount}" ;; + m) echo $((amount * 60)) ;; + h) echo $((amount * 3600)) ;; + *) + echo "::error::unsupported duration unit in '${input_value}'" >&2 + exit 1 + ;; + esac +} + MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}" MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}" if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then @@ -32,6 +51,13 @@ if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then exit 1 fi +MAX_RESTARTS="${MAX_RESTARTS:-}" +MAX_RESTARTS="${MAX_RESTARTS#"${MAX_RESTARTS%%[![:space:]]*}"}" +MAX_RESTARTS="${MAX_RESTARTS%"${MAX_RESTARTS##*[![:space:]]}"}" +if [[ -n "${MAX_RESTARTS}" ]] && [[ "${MAX_RESTARTS}" != "1" ]]; then + echo "::warning::max_restarts is deprecated and ignored; use stability_window to fail on new control-plane restarts" +fi + WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}" WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}" validate_duration_input wait_timeout "${WAIT_TIMEOUT}" @@ -45,6 +71,42 @@ validate_duration_input stability_window "${STABILITY_WINDOW}" if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then STABILITY_WINDOW="0s" fi +STABILITY_WINDOW_SECONDS="$(duration_seconds "${STABILITY_WINDOW}")" + +STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL:-10s}" +STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL#"${STABILITY_PROBE_INTERVAL%%[![:space:]]*}"}" 
+STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL%"${STABILITY_PROBE_INTERVAL##*[![:space:]]}"}" +validate_duration_input stability_probe_interval "${STABILITY_PROBE_INTERVAL}" +STABILITY_PROBE_INTERVAL_SECONDS="$(duration_seconds "${STABILITY_PROBE_INTERVAL}")" +if (( STABILITY_PROBE_INTERVAL_SECONDS <= 0 )); then + echo "::error::stability_probe_interval must be greater than 0, got '${STABILITY_PROBE_INTERVAL}'" + exit 1 +fi +STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD:-2}" +STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD#"${STABILITY_PROBE_FAILURE_THRESHOLD%%[![:space:]]*}"}" +STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD%"${STABILITY_PROBE_FAILURE_THRESHOLD##*[![:space:]]}"}" +if ! [[ "${STABILITY_PROBE_FAILURE_THRESHOLD}" =~ ^[0-9]+$ ]]; then + echo "::error::stability_probe_failure_threshold must be a positive integer, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'" + exit 1 +fi +if (( STABILITY_PROBE_FAILURE_THRESHOLD <= 0 )); then + echo "::error::stability_probe_failure_threshold must be greater than 0, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'" + exit 1 +fi + +LEASE_COMPONENTS="${LEASE_COMPONENTS:-kube-controller-manager kube-scheduler}" +LEASE_COMPONENTS="${LEASE_COMPONENTS#"${LEASE_COMPONENTS%%[![:space:]]*}"}" +LEASE_COMPONENTS="${LEASE_COMPONENTS%"${LEASE_COMPONENTS##*[![:space:]]}"}" + +LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT:-120s}" +LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT#"${LEASE_STALE_TIMEOUT%%[![:space:]]*}"}" +LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT%"${LEASE_STALE_TIMEOUT##*[![:space:]]}"}" +validate_duration_input lease_stale_timeout "${LEASE_STALE_TIMEOUT}" +LEASE_STALE_TIMEOUT_SECONDS="$(duration_seconds "${LEASE_STALE_TIMEOUT}")" +if (( LEASE_STALE_TIMEOUT_SECONDS <= 0 )); then + echo "::error::lease_stale_timeout must be greater than 0, got '${LEASE_STALE_TIMEOUT}'" + exit 1 +fi RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}" 
RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}" @@ -147,6 +209,114 @@ require_readyz() { fi } +probe_control_plane_api() { + local reason="$1" + local component + local lease_summary + + if ! kubectl_kind get --raw='/readyz' >/dev/null; then + echo "::error::kube-apiserver /readyz probe failed ${reason}" + return 1 + fi + + for component in ${LEASE_COMPONENTS}; do + if ! lease_summary=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" \ + -o jsonpath='{.metadata.name}{" holder="}{.spec.holderIdentity}{" renewTime="}{.spec.renewTime}{"\n"}' 2>/dev/null); then + echo "::error::failed to read leader election lease ${component} ${reason}" + return 1 + fi + echo "${lease_summary}" + done +} + +lease_renew_epoch() { + local renew_time="$1" + + date -u -d "${renew_time}" +%s 2>/dev/null +} + +verify_leader_lease_freshness() { + local component + local now_epoch + local renew_time + local renew_epoch + local lease_age + + [[ -z "${LEASE_COMPONENTS}" ]] && return + + now_epoch="$(date -u +%s)" + echo "Checking leader election lease freshness (max age ${LEASE_STALE_TIMEOUT})..." + for component in ${LEASE_COMPONENTS}; do + if ! renew_time=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o jsonpath='{.spec.renewTime}' 2>/dev/null); then + echo "::error::failed to read leader election lease ${component}" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + if [[ -z "${renew_time}" ]]; then + echo "::error::leader election lease ${component} has empty spec.renewTime" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + if ! 
renew_epoch="$(lease_renew_epoch "${renew_time}")"; then + echo "::error::failed to parse leader election lease ${component} renewTime '${renew_time}'" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + lease_age=$((now_epoch - renew_epoch)) + if (( lease_age < 0 )); then + lease_age=0 + fi + echo "${component} lease renewTime=${renew_time} age=${lease_age}s" + if (( lease_age > LEASE_STALE_TIMEOUT_SECONDS )); then + echo "::error::leader election lease ${component} is stale: age=${lease_age}s exceeds ${LEASE_STALE_TIMEOUT}" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + done +} + +observe_stability_window() { + local label="$1" + local elapsed=0 + local probe=0 + local sleep_seconds + local consecutive_failures=0 + local total_failures=0 + + echo "Observing control-plane stability for ${STABILITY_WINDOW} (${label}); probing every ${STABILITY_PROBE_INTERVAL}, failing after ${STABILITY_PROBE_FAILURE_THRESHOLD} consecutive probe failure(s)..." + while (( elapsed < STABILITY_WINDOW_SECONDS )); do + sleep_seconds="${STABILITY_PROBE_INTERVAL_SECONDS}" + if (( elapsed + sleep_seconds > STABILITY_WINDOW_SECONDS )); then + sleep_seconds=$((STABILITY_WINDOW_SECONDS - elapsed)) + fi + if (( sleep_seconds > 0 )); then + sleep "${sleep_seconds}" + elapsed=$((elapsed + sleep_seconds)) + fi + + probe=$((probe + 1)) + echo "=== Control-plane stability probe ${probe} (${elapsed}/${STABILITY_WINDOW_SECONDS}s, ${label}) ===" + if probe_control_plane_api "during ${label} stability probe ${probe}"; then + consecutive_failures=0 + continue + fi + + total_failures=$((total_failures + 1)) + consecutive_failures=$((consecutive_failures + 1)) + echo "::warning::control-plane stability probe ${probe} failed (${consecutive_failures} consecutive, ${total_failures} total)" + if (( consecutive_failures >= STABILITY_PROBE_FAILURE_THRESHOLD )); then + echo "::error::control-plane had ${consecutive_failures} consecutive failed stability probes during ${label}" + 
dump_all_control_plane_runtime_diagnostics + exit 1 + fi + done + + if (( total_failures > 0 )); then + echo "::warning::control-plane had ${total_failures} transient failed stability probe(s) during ${label}; final health checks must still pass" + fi + verify_leader_lease_freshness +} + dump_api_server_health() { local endpoint @@ -391,8 +561,7 @@ verify_stability_window() { return fi - echo "Observing control-plane stability for ${STABILITY_WINDOW}..." - sleep "${STABILITY_WINDOW}" + observe_stability_window "primary" for component in ${COMPONENTS}; do initial_restarts="${INITIAL_RESTARTS[${component}]:-}" if [[ -z "${initial_restarts}" ]]; then @@ -427,7 +596,7 @@ verify_stability_window() { fi echo "::warning::control-plane recovery occurred; observing one additional ${STABILITY_WINDOW} stability window" - sleep "${STABILITY_WINDOW}" + observe_stability_window "post-recovery" for component in ${COMPONENTS}; do initial_restarts="${INITIAL_RESTARTS[${component}]:-}" if [[ -z "${initial_restarts}" ]]; then diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh index 50d8c10fa..16352077c 100644 --- a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh +++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh @@ -18,10 +18,19 @@ set -euo pipefail sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place +set +e timeout 120s sudo systemctl restart docker +restart_status=$? 
+set -e +if (( restart_status != 0 )); then + echo "::error::Docker restart failed after NVIDIA runtime configuration" + sudo systemctl status docker --no-pager || true + journalctl -u docker --since "10 minutes ago" --no-pager || true + exit "${restart_status}" +fi for attempt in $(seq 1 30); do - if systemctl is-active --quiet docker && docker info >/dev/null 2>&1; then + if systemctl is-active --quiet docker && timeout 5s docker info >/dev/null 2>&1; then echo "Docker is healthy after NVIDIA runtime configuration." exit 0 fi diff --git a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh index 2f9ab4817..0c22fb845 100644 --- a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh +++ b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh @@ -25,6 +25,20 @@ validate_duration_input() { fi } +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +kubectl_kind_wait() { + timeout 330s kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +docker_timeout() { + local limit="$1" + shift + timeout "${limit}" docker "$@" +} + validate_generated_control_plane_config() { if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then for patch_file in "${patch_dir}"/*.yaml; do @@ -288,22 +302,22 @@ case "${create_status}" in ;; esac -kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s -kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info -kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide -kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes | \ +kubectl_kind_wait wait --for=condition=Ready nodes --all --timeout=300s +kubectl_kind cluster-info +kubectl_kind get nodes -o wide +kubectl_kind describe nodes | \ grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:| cpu| memory| nvidia.com/gpu)" || true echo "=== Kind node container 
resources ===" -docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ +docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ --format '{{.Names}}' | sort | while read -r node_container; do [[ -z "${node_container}" ]] && continue - docker inspect "${node_container}" \ + docker_timeout 30s inspect "${node_container}" \ --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' done echo "=== Control-plane resource requests/limits ===" -kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ +kubectl_kind -n kube-system \ get pods -l tier=control-plane -o json | jq -r ' .items[] as $pod | $pod.metadata.name, @@ -327,7 +341,7 @@ control_plane_request() { local component="$1" local resource="$2" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ + kubectl_kind -n kube-system \ get pod -l "component=${component}" \ -o "jsonpath={.items[0].spec.containers[0].resources.requests.${resource}}" } @@ -353,7 +367,7 @@ assert_control_plane_request() { control_plane_command_args() { local component="$1" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \ + kubectl_kind -n kube-system \ get pod -l "component=${component}" \ -o json | jq -r '.items[0].spec.containers[0] | ((.command // []) + (.args // []))[]?' } @@ -363,7 +377,7 @@ static_pod_manifest_contains_arg() { local expected="$2" local node="${KIND_CLUSTER_NAME}-control-plane" - docker exec "${node}" grep -Fq -- "- ${expected}" "/etc/kubernetes/manifests/${component}.yaml" + docker_timeout 30s exec "${node}" grep -Fq -- "- ${expected}" "/etc/kubernetes/manifests/${component}.yaml" } running_static_pod_container_contains_arg() { @@ -372,16 +386,18 @@ running_static_pod_container_contains_arg() { local node="${KIND_CLUSTER_NAME}-control-plane" local container_ids local container_id + local inspect_output - if ! 
container_ids="$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null)"; then + if ! container_ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null)"; then return 1 fi [[ -z "${container_ids}" ]] && return 1 for container_id in ${container_ids}; do - if docker exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -e --arg expected "${expected}" ' + inspect_output="$(docker_timeout 30s exec "${node}" crictl inspect "${container_id}" 2>/dev/null || true)" + if jq -e --arg expected "${expected}" ' ([.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?] | index($expected)) != null - ' >/dev/null; then + ' >/dev/null 2>&1 <<< "${inspect_output}" || grep -Fq -- "${expected}" <<< "${inspect_output}"; then return 0 fi done @@ -395,14 +411,14 @@ dump_running_static_pod_container_args() { local container_id echo "Running ${component} CRI container args:" - container_ids="$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)" + container_ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)" if [[ -z "${container_ids}" ]]; then echo "(no running ${component} CRI containers found)" return fi for container_id in ${container_ids}; do echo "--- ${container_id} ---" - docker exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -r ' + docker_timeout 30s exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -r ' [.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?][]? 
' || true done @@ -413,32 +429,40 @@ dump_static_pod_manifest() { local node="${KIND_CLUSTER_NAME}-control-plane" echo "Static pod manifest /etc/kubernetes/manifests/${component}.yaml:" - docker exec "${node}" sed -n '1,220p' "/etc/kubernetes/manifests/${component}.yaml" || true + docker_timeout 30s exec "${node}" sed -n '1,220p' "/etc/kubernetes/manifests/${component}.yaml" || true } assert_control_plane_arg() { local component="$1" local expected="$2" + local attempt local command_args - command_args="$(control_plane_command_args "${component}")" - if ! grep -Fxq -- "${expected}" <<< "${command_args}"; then + for attempt in $(seq 1 12); do + command_args="$(control_plane_command_args "${component}" || true)" + if grep -Fxq -- "${expected}" <<< "${command_args}"; then + echo "${component} command/args verified: ${expected}" + return + fi if running_static_pod_container_contains_arg "${component}" "${expected}"; then echo "${component} running CRI container args verified: ${expected} (live mirror pod omitted it)" return fi if static_pod_manifest_contains_arg "${component}" "${expected}"; then - echo "::warning::${component} live mirror pod and running CRI container args did not show ${expected}; static pod manifest is patched" - return + echo "::warning::${component} static pod manifest has ${expected}, but the running container does not yet; waiting for kubelet to converge (${attempt}/12)" + sleep 5 + continue fi - echo "::error::${component} live pod command/args does not contain ${expected}" - echo "Observed live command/args:" - echo "${command_args}" - dump_running_static_pod_container_args "${component}" - dump_static_pod_manifest "${component}" - exit 1 - fi - echo "${component} command/args verified: ${expected}" + + break + done + + echo "::error::${component} running command/args does not contain ${expected}" + echo "Observed live command/args:" + echo "${command_args:-}" + dump_running_static_pod_container_args "${component}" + dump_static_pod_manifest 
"${component}" + exit 1 } if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then diff --git a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh index 0f29c469b..5e0a81778 100644 --- a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh +++ b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh @@ -19,6 +19,21 @@ docker_timeout() { timeout 30s docker "$@" } +read_kind_container_ids() { + local output + + if ! output="$(docker_timeout ps -aq --filter "label=${kind_cluster_label}" 2>&1)"; then + echo "::error::failed to query stale kind containers for ${KIND_CLUSTER_NAME}" + echo "${output}" + exit 1 + fi + + remaining_containers=() + if [[ -n "${output}" ]]; then + mapfile -t remaining_containers <<< "${output}" + fi +} + if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}" if ! timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then @@ -28,14 +43,14 @@ else echo "No stale kind cluster named ${KIND_CLUSTER_NAME}" fi -mapfile -t remaining_containers < <(docker_timeout ps -aq --filter "label=${kind_cluster_label}") +read_kind_container_ids if (( ${#remaining_containers[@]} > 0 )); then echo "Removing stale containers for ${KIND_CLUSTER_NAME}:" docker_timeout ps -a --filter "label=${kind_cluster_label}" docker_timeout rm -f "${remaining_containers[@]}" fi -mapfile -t remaining_containers < <(docker_timeout ps -aq --filter "label=${kind_cluster_label}") +read_kind_container_ids if (( ${#remaining_containers[@]} > 0 )); then echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:" docker_timeout ps -a --filter "label=${kind_cluster_label}" diff --git a/.github/actions/gpu-debug-diagnostics/action.yml b/.github/actions/gpu-debug-diagnostics/action.yml new file mode 100644 index 000000000..e5a38b964 --- /dev/null +++ b/.github/actions/gpu-debug-diagnostics/action.yml @@ -0,0 
+1,35 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: 'GPU Debug Diagnostics' +description: 'Print bounded GPU CI diagnostics while the kind cluster is still present.' + +inputs: + cluster_name: + description: 'Kind cluster name' + required: true + mode: + description: 'Diagnostic mode: smoke, training, or inference' + required: false + default: 'smoke' + +runs: + using: 'composite' + steps: + - name: Print GPU debug diagnostics + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + GPU_TEST_DIAGNOSTIC_MODE: ${{ inputs.mode }} + run: bash "${{ github.workspace }}/.github/scripts/gpu-debug-diagnostics.sh" diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh index 2fee8a2c0..9566fb8ba 100644 --- a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh +++ b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh @@ -19,16 +19,28 @@ echo "Waiting for GPU operator controller to deploy operands..." # The GPU operator controller watches ClusterPolicy and creates # DaemonSets for device-plugin, NFD, GFD, etc. This happens # asynchronously after the helm install completes. 
+daemonset_found=false for i in $(seq 1 30); do - count=$(kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l) - if [[ "$count" -gt 0 ]]; then + daemonsets="" + if daemonsets=$(kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ + get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null); then + if [[ -n "${daemonsets}" ]]; then + daemonset_found=true + fi + fi + if [[ "${daemonset_found}" == "true" ]]; then echo "Device plugin DaemonSet found." break fi echo "Waiting for device plugin DaemonSet to be created... (${i}/30)" sleep 10 done +if [[ "${daemonset_found}" != "true" ]]; then + echo "::error::device plugin DaemonSet was not created within 300s" + kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods || true + kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' || true + exit 1 +fi echo "Waiting for device plugin rollout..." # Operands are excluded from control-plane nodes via nodeAffinity in # the kind overlay, so all scheduled pods should become ready. 
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh index 2f0bbe159..3d3042f8a 100644 --- a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh +++ b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh @@ -32,6 +32,6 @@ for i in $(seq 1 30); do done kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true + rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s echo "GPU Operator pods:" kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods diff --git a/.github/actions/gpu-smoke-nvidia-smi/action.yml b/.github/actions/gpu-smoke-nvidia-smi/action.yml new file mode 100644 index 000000000..cb61b5d0d --- /dev/null +++ b/.github/actions/gpu-smoke-nvidia-smi/action.yml @@ -0,0 +1,36 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: 'GPU Smoke nvidia-smi' +description: 'Run nvidia-smi in a GPU-backed kind pod and print its logs.' 
+ +inputs: + cluster_name: + description: 'Kind cluster name' + required: true + +runs: + using: 'composite' + steps: + - name: Run nvidia-smi in a pod + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-run-nvidia-smi.sh" + - name: Show nvidia-smi output + if: always() + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh" diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml index a1eef57b9..417130669 100644 --- a/.github/actions/gpu-test-cleanup/action.yml +++ b/.github/actions/gpu-test-cleanup/action.yml @@ -23,18 +23,22 @@ inputs: description: 'Prefix for the uploaded artifact name' required: false default: 'gpu-test-debug' + collect_artifacts: + description: 'Collect and upload debug artifacts before deleting the kind cluster' + required: false + default: 'false' runs: using: 'composite' steps: - name: Collect debug artifacts - if: failure() || cancelled() + if: inputs.collect_artifacts == 'true' shell: bash env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} run: bash "${{ github.action_path }}/collect-debug-artifacts.sh" - name: Export kind logs - if: failure() || cancelled() + if: always() && inputs.collect_artifacts == 'true' shell: bash env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} @@ -46,7 +50,7 @@ runs: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} run: bash "${{ github.action_path }}/cleanup-kind-cluster.sh" - name: Upload debug artifacts - if: failure() || cancelled() + if: always() && inputs.collect_artifacts == 'true' uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: ${{ inputs.artifact_name_prefix }}-${{ github.run_id }} diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh index 
257695334..cb33e770a 100644 --- a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh +++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh @@ -16,6 +16,7 @@ # Diagnostic artifact collection intentionally omits -e so one broken cluster # call does not prevent later artifacts from being collected. set -uo pipefail +rm -rf /tmp/debug-artifacts mkdir -p /tmp/debug-artifacts CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd" kubectl_kind() { @@ -27,6 +28,23 @@ docker_timeout() { timeout "${limit}" docker "$@" } +{ + date -u || true + hostname || true + uptime || true + nproc || true + free -h || true + df -h / || true + df -ih / || true +} > /tmp/debug-artifacts/runner-baseline.txt 2>&1 || true +docker_timeout 30s version > /tmp/debug-artifacts/docker-version.txt 2>&1 || true +docker_timeout 30s info > /tmp/debug-artifacts/docker-info.txt 2>&1 || true +nvidia-smi -L > /tmp/debug-artifacts/host-gpus.txt 2>&1 || true +nvidia-smi >> /tmp/debug-artifacts/host-gpus.txt 2>&1 || true +kind get clusters > /tmp/debug-artifacts/kind-clusters.txt 2>&1 || true +docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + > /tmp/debug-artifacts/kind-node-containers.txt 2>&1 || true + kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true @@ -48,6 +66,24 @@ done kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true +kubectl_kind -n monitoring get 
deployment,statefulset,daemonset,pods -o wide \ + > /tmp/debug-artifacts/monitoring-workloads.txt 2>&1 || true +kubectl_kind -n monitoring describe deployment kube-prometheus-operator \ + > /tmp/debug-artifacts/kube-prometheus-operator-deployment-describe.txt 2>&1 || true +kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=300 \ + > /tmp/debug-artifacts/kube-prometheus-operator-logs.txt 2>&1 || true +kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=300 \ + > /tmp/debug-artifacts/kube-prometheus-operator-previous-logs.txt 2>&1 || true +kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' \ + > /tmp/debug-artifacts/monitoring-events.txt 2>&1 || true +{ + kubectl_kind -n monitoring get pods -o name 2>/dev/null \ + | grep '^pod/kube-prometheus-operator-' \ + | while read -r pod; do + echo "=== ${pod} ===" + kubectl_kind -n monitoring describe "${pod}" 2>&1 || true + done +} > /tmp/debug-artifacts/kube-prometheus-operator-pods-describe.txt 2>&1 || true kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true tar_inputs=() [[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml) @@ -63,6 +99,8 @@ docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME} --format '{{.Names}}' | sort | while read -r node_container; do [[ -z "${node_container}" ]] && continue node_file="${node_container//[^A-Za-z0-9_.-]/_}" + docker_timeout 30s inspect "${node_container}" \ + > "/tmp/debug-artifacts/${node_file}-docker-inspect.json" 2>&1 || true docker_timeout 30s exec "${node_container}" journalctl -u kubelet \ --since "90 minutes ago" --no-pager \ > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true @@ -83,6 +121,7 @@ docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME} echo "--- top cpu/memory processes ---" ps -eo 
pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true + # shellcheck disable=SC2016 # Expanded inside the kind node shell. docker_timeout 120s exec "${node_container}" sh -c ' for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do echo "=== ${component} static pod manifest ===" diff --git a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh index 472eb844b..8987144ab 100644 --- a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh +++ b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh @@ -32,7 +32,7 @@ validate_seconds_input() { echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'" exit 1 fi - if (( input_value <= 0 )); then + if (( 10#${input_value} <= 0 )); then echo "::error::${input_name} must be greater than 0 seconds, got '${input_value}'" exit 1 fi diff --git a/.github/scripts/gpu-chainsaw-health.sh b/.github/scripts/gpu-chainsaw-health.sh index 7e2c5ac96..7098c6bf8 100644 --- a/.github/scripts/gpu-chainsaw-health.sh +++ b/.github/scripts/gpu-chainsaw-health.sh @@ -26,11 +26,53 @@ if [[ ! -d "${test_dir}" ]]; then fi CHAINSAW_TEST_TIMEOUT="${CHAINSAW_TEST_TIMEOUT:-30m}" -CHAINSAW_CLEANUP_TIMEOUT="${CHAINSAW_CLEANUP_TIMEOUT:-120s}" -CHAINSAW_DELETE_TIMEOUT="${CHAINSAW_DELETE_TIMEOUT:-120s}" +MONITORING_READY_TIMEOUT="${MONITORING_READY_TIMEOUT:-180s}" + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +kubectl_kind_wait() { + # Rollout status opens a watch that is already bounded by --timeout. Keep + # request-timeout unset here so a slow API server does not cut the watch short. 
+ kubectl --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +print_monitoring_diagnostics() { + echo "=== Monitoring workloads ===" + kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide 2>/dev/null || true + echo "=== kube-prometheus-operator deployment ===" + kubectl_kind -n monitoring get deployment kube-prometheus-operator -o wide 2>/dev/null || true + echo "=== kube-prometheus-operator deployment describe ===" + kubectl_kind -n monitoring describe deployment kube-prometheus-operator 2>/dev/null || true + echo "=== kube-prometheus-operator pods ===" + kubectl_kind -n monitoring get pods -o wide 2>/dev/null \ + | grep -E '(^NAME|^kube-prometheus-operator-)' || true + echo "=== kube-prometheus-operator logs ===" + kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=200 2>/dev/null || true + echo "=== kube-prometheus-operator previous logs ===" + kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=200 2>/dev/null || true + echo "=== Recent events (monitoring) ===" + kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' 2>/dev/null | tail -100 || true +} + +wait_for_monitoring_operator() { + echo "Waiting for monitoring/kube-prometheus-operator before Chainsaw..." + print_monitoring_diagnostics + if kubectl_kind_wait -n monitoring rollout status deployment/kube-prometheus-operator \ + --timeout="${MONITORING_READY_TIMEOUT}"; then + echo "monitoring/kube-prometheus-operator is rolled out." 
+ return 0 + fi + + echo "::error::monitoring/kube-prometheus-operator did not become available within ${MONITORING_READY_TIMEOUT}" + print_monitoring_diagnostics + return 1 +} + +wait_for_monitoring_operator timeout "${CHAINSAW_TEST_TIMEOUT}" chainsaw test \ --test-dir "${test_dir}" \ --config tests/chainsaw/chainsaw-config.yaml \ - --cleanup-timeout "${CHAINSAW_CLEANUP_TIMEOUT}" \ - --delete-timeout "${CHAINSAW_DELETE_TIMEOUT}" + --skip-delete diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh index 3721691b3..3db82a6e8 100644 --- a/.github/scripts/gpu-debug-diagnostics.sh +++ b/.github/scripts/gpu-debug-diagnostics.sh @@ -24,6 +24,39 @@ kubectl_kind() { timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" } +docker_timeout() { + local limit="$1" + shift + timeout "${limit}" docker "$@" +} + +print_setup_diagnostics() { + echo "=== Runner baseline ===" + date -u || true + hostname || true + uptime || true + nproc || true + free -h || true + df -h / || true + df -ih / || true + echo "=== Docker health ===" + docker info >/dev/null 2>&1 && docker version || true + echo "=== Host GPUs ===" + nvidia-smi -L || true + nvidia-smi || true + echo "=== Kind clusters ===" + kind get clusters || true + echo "=== Kind node containers ===" + docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true + echo "=== Kind node container resources ===" + docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + --format '{{.Names}}' | sort | while read -r node_container; do + [[ -z "${node_container}" ]] && continue + docker_timeout 30s inspect "${node_container}" \ + --format '{{.Name}} State={{.State.Status}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' || true + done || true +} + print_workload_images() { local ns="$1" kubectl_kind -n 
"${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \ @@ -47,15 +80,39 @@ print_workload_inventory() { done } -print_grafana_diagnostics() { - echo "=== Grafana deployment ===" - kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true - echo "=== Grafana pods ===" - kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true - echo "=== Grafana deployment describe ===" - kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true - echo "=== Grafana pod describe ===" - kubectl_kind -n monitoring describe pods -l app.kubernetes.io/name=grafana 2>/dev/null || true +print_component_status_summary() { + echo "=== Component workload status ===" + kubectl_kind get deployments,statefulsets,daemonsets,pods -A -o wide 2>/dev/null || true + echo "=== Component rollout conditions ===" + kubectl_kind get deployments,statefulsets,daemonsets -A \ + -o custom-columns='KIND:.kind,NAMESPACE:.metadata.namespace,NAME:.metadata.name,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas,DESIRED:.status.replicas,UPDATED:.status.updatedReplicas,AGE:.metadata.creationTimestamp' \ + 2>/dev/null || true + echo "=== Non-ready pods ===" + kubectl_kind get pods -A \ + --field-selector=status.phase!=Running,status.phase!=Succeeded \ + -o wide 2>/dev/null || true +} + +print_kube_prometheus_operator_diagnostics() { + echo "=== Monitoring workloads ===" + kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide 2>/dev/null || true + echo "=== kube-prometheus-operator deployment ===" + kubectl_kind -n monitoring get deployment kube-prometheus-operator -o wide 2>/dev/null || true + echo "=== kube-prometheus-operator deployment describe ===" + kubectl_kind -n monitoring describe deployment kube-prometheus-operator 2>/dev/null || true + echo "=== kube-prometheus-operator pod describe ===" + kubectl_kind -n monitoring get pods -o name 2>/dev/null \ + | grep 
'^pod/kube-prometheus-operator-' \ + | while read -r pod; do + echo "--- ${pod} ---" + kubectl_kind -n monitoring describe "${pod}" 2>/dev/null || true + done || true + echo "=== kube-prometheus-operator logs ===" + kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=200 2>/dev/null || true + echo "=== kube-prometheus-operator previous logs ===" + kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=200 2>/dev/null || true + echo "=== Recent events (monitoring) ===" + kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' 2>/dev/null | tail -80 || true } print_kai_diagnostics() { @@ -83,6 +140,34 @@ print_kai_diagnostics() { kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true } +print_custom_metrics() { + local metric + local ns + local namespaces=("$@") + + echo "=== Custom metrics API ===" + for metric in gpu_utilization gpu_memory_used gpu_power_usage; do + for ns in "${namespaces[@]}"; do + echo "--- ${ns}/${metric} ---" + kubectl_kind get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${ns}/pods/*/${metric}" 2>/dev/null \ + | jq . 
|| true + done + done +} + +print_metrics_pipeline_diagnostics() { + echo "=== prometheus-adapter pods ===" + kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true + echo "=== DCGM Exporter pods ===" + kubectl_kind -n gpu-operator get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true + echo "=== Monitoring pods ===" + kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true + echo "=== DRA ResourceSlices ===" + kubectl_kind get resourceslices -o wide 2>/dev/null || true + echo "=== Node status ===" + kubectl_kind get nodes -o wide 2>/dev/null || true +} + print_common_gpu_diagnostics() { echo "=== ClusterPolicy status ===" kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true @@ -94,64 +179,75 @@ print_common_gpu_diagnostics() { kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true } +print_h100_common_diagnostics() { + local metric_namespaces=("$@") + local common_namespaces=( + cert-manager + gpu-operator + monitoring + skyhook + nvsentinel + nvidia-dra-driver + nvidia-network-operator + kai-scheduler + ) + + print_setup_diagnostics + print_component_status_summary + print_workload_inventory "${common_namespaces[@]}" "${metric_namespaces[@]}" + print_common_gpu_diagnostics + print_kube_prometheus_operator_diagnostics + print_kai_diagnostics + print_custom_metrics gpu-operator "${metric_namespaces[@]}" + print_metrics_pipeline_diagnostics + echo "=== Node resources ===" + kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true +} + +print_kubeflow_diagnostics() { + echo "=== Kubeflow Trainer deployment ===" + kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true + echo "=== Kubeflow pods ===" + kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true + echo "=== Kubeflow validating webhooks ===" + kubectl_kind get validatingwebhookconfigurations 
validator.trainer.kubeflow.org -o yaml 2>/dev/null || true + echo "=== Kubeflow Trainer CRD ===" + kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true +} + +print_dynamo_diagnostics() { + echo "=== Dynamo pods ===" + kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true + echo "=== Dynamo operator logs ===" + kubectl_kind -n dynamo-system logs deployment/dynamo-platform-dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true + echo "=== Recent events (dynamo-system) ===" + kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true +} + +print_kgateway_diagnostics() { + echo "=== kgateway pods ===" + kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true + echo "=== GatewayClass status ===" + kubectl_kind get gatewayclass -o yaml 2>/dev/null || true + echo "=== Gateway status ===" + kubectl_kind get gateways -A -o yaml 2>/dev/null || true +} + case "${mode}" in smoke) + print_setup_diagnostics print_common_gpu_diagnostics echo "=== Node status ===" kubectl_kind get nodes -o wide 2>/dev/null || true ;; training) - print_workload_inventory cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \ - nvidia-network-operator kai-scheduler kubeflow - print_common_gpu_diagnostics - print_grafana_diagnostics - print_kai_diagnostics - echo "=== Kubeflow Trainer deployment ===" - kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true - echo "=== Kubeflow pods ===" - kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true - echo "=== Kubeflow validating webhooks ===" - kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true - echo "=== Kubeflow Trainer CRD ===" - kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true - echo "=== Node resources ===" - kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated 
resources" || true + print_h100_common_diagnostics kubeflow + print_kubeflow_diagnostics ;; inference) - print_workload_inventory cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \ - nvidia-network-operator kai-scheduler dynamo-system kgateway-system - print_common_gpu_diagnostics - echo "=== Dynamo pods ===" - kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true - echo "=== Dynamo operator logs ===" - kubectl_kind -n dynamo-system logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true - echo "=== Recent events (dynamo-system) ===" - kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - print_kai_diagnostics - echo "=== Custom metrics API ===" - for metric in gpu_utilization gpu_memory_used gpu_power_usage; do - echo "--- ${metric} ---" - for ns in gpu-operator dynamo-system; do - kubectl_kind get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${ns}/pods/*/${metric}" 2>/dev/null | jq . 
|| true - done - done - print_grafana_diagnostics - echo "=== prometheus-adapter pods ===" - kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true - echo "=== kgateway pods ===" - kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true - echo "=== GatewayClass status ===" - kubectl_kind get gatewayclass -o yaml 2>/dev/null || true - echo "=== Gateway status ===" - kubectl_kind get gateways -A -o yaml 2>/dev/null || true - echo "=== DCGM Exporter pods ===" - kubectl_kind -n gpu-operator get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true - echo "=== Monitoring pods ===" - kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true - echo "=== DRA ResourceSlices ===" - kubectl_kind get resourceslices -o wide 2>/dev/null || true - echo "=== Node status ===" - kubectl_kind get nodes -o wide 2>/dev/null || true + print_h100_common_diagnostics dynamo-system kgateway-system + print_dynamo_diagnostics + print_kgateway_diagnostics ;; *) echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}" diff --git a/.github/scripts/gpu-runtime-component-health.sh b/.github/scripts/gpu-runtime-component-health.sh new file mode 100644 index 000000000..93b8efc7b --- /dev/null +++ b/.github/scripts/gpu-runtime-component-health.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +if [[ $# -ne 1 ]]; then + echo "::error::Usage: $0 " + exit 2 +fi + +mode="$1" +COMPONENT_HEALTH_TIMEOUT="${COMPONENT_HEALTH_TIMEOUT:-120s}" + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +kubectl_kind_wait() { + # kubectl wait opens a watch that is already bounded by --timeout. Keep + # request-timeout unset here so a slow API server does not cut the watch short. + kubectl --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +print_namespace_diagnostics() { + local ns="$1" + + echo "=== ${ns} workloads ===" + kubectl_kind -n "${ns}" get deployments,statefulsets,daemonsets,pods -o wide 2>/dev/null || true + echo "=== Recent events (${ns}) ===" + kubectl_kind -n "${ns}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -80 || true +} + +wait_for_deployments() { + local ns="$1" + shift + local deployments=("$@") + + echo "Waiting up to ${COMPONENT_HEALTH_TIMEOUT} for ${ns}: ${deployments[*]}" + if kubectl_kind_wait -n "${ns}" wait \ + --for=condition=Available \ + --timeout="${COMPONENT_HEALTH_TIMEOUT}" \ + "${deployments[@]}"; then + return 0 + fi + + echo "::error::One or more deployments in ${ns} did not become Available within ${COMPONENT_HEALTH_TIMEOUT}: ${deployments[*]}" + print_namespace_diagnostics "${ns}" + return 1 +} + +wait_for_required_object() { + local resource="$1" + + echo "Verifying ${resource}" + if kubectl_kind get "${resource}" >/dev/null; then + return 0 + fi + + echo "::error::Required object is missing: ${resource}" + return 1 +} + +echo "=== Runtime component health (${mode}) ===" + +wait_for_deployments monitoring \ + deployment/kube-prometheus-operator + +wait_for_deployments kai-scheduler \ + deployment/kai-scheduler-default \ + deployment/admission \ + deployment/binder \ + deployment/kai-operator \ + deployment/pod-grouper \ + deployment/podgroup-controller \ + deployment/queue-controller + +case "${mode}" in + training) + wait_for_deployments 
kubeflow \ + deployment/kubeflow-trainer-controller-manager + wait_for_required_object validatingwebhookconfiguration/validator.trainer.kubeflow.org + wait_for_required_object customresourcedefinition/trainjobs.trainer.kubeflow.org + ;; + inference) + wait_for_deployments dynamo-system \ + deployment/dynamo-platform-dynamo-operator-controller-manager \ + deployment/grove-operator + wait_for_deployments kgateway-system \ + deployment/kgateway \ + deployment/inference-gateway + ;; + *) + echo "::error::unknown runtime component health mode: ${mode}" + exit 2 + ;; +esac + +echo "Runtime component health check passed." diff --git a/.github/scripts/gpu-smoke-run-nvidia-smi.sh b/.github/scripts/gpu-smoke-run-nvidia-smi.sh index 1a2151253..0d4ea31d7 100644 --- a/.github/scripts/gpu-smoke-run-nvidia-smi.sh +++ b/.github/scripts/gpu-smoke-run-nvidia-smi.sh @@ -15,7 +15,19 @@ set -euo pipefail -pod_name=$(cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" create -f - -o jsonpath='{.metadata.name}' +KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}" +KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-130s}" +POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}" + +kubectl_kind() { + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +kubectl_kind_wait() { + timeout 150s kubectl --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +pod_name=$(cat <<'EOF' | kubectl_kind create -f - -o jsonpath='{.metadata.name}' apiVersion: v1 kind: Pod metadata: @@ -34,10 +46,10 @@ spec: EOF ) -echo "${pod_name}" > /tmp/aicr-gpu-smoke-pod-name +echo "${pod_name}" > "${POD_NAME_FILE}" echo "Waiting for ${pod_name} pod to complete..." 
-kubectl --context="kind-${KIND_CLUSTER_NAME}" wait "pod/${pod_name}" \ +kubectl_kind_wait wait "pod/${pod_name}" \ --for=condition=Ready --timeout=120s || true -kubectl --context="kind-${KIND_CLUSTER_NAME}" wait "pod/${pod_name}" \ +kubectl_kind_wait wait "pod/${pod_name}" \ --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s diff --git a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh index 982648460..25b33c862 100644 --- a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh +++ b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh @@ -15,13 +15,20 @@ set -euo pipefail +KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}" +POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}" + +kubectl_kind() { + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + pod_name="" -if [[ -f /tmp/aicr-gpu-smoke-pod-name ]]; then - pod_name="$(cat /tmp/aicr-gpu-smoke-pod-name)" +if [[ -f "${POD_NAME_FILE}" ]]; then + pod_name="$(cat "${POD_NAME_FILE}")" fi if [[ -z "${pod_name}" ]]; then - pod_name=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods \ + pod_name=$(kubectl_kind get pods \ -l app=gpu-smoke-test \ --sort-by=.metadata.creationTimestamp \ -o jsonpath='{.items[-1:].metadata.name}') @@ -32,4 +39,5 @@ if [[ -z "${pod_name}" ]]; then exit 1 fi -kubectl --context="kind-${KIND_CLUSTER_NAME}" logs "${pod_name}" +kubectl_kind logs "${pod_name}" +rm -f "${POD_NAME_FILE}" diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index 08152c25e..4f06bb396 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -16,7 +16,7 @@ name: GPU Inference Test (nvkind + H100 x2) on: schedule: - - cron: '15 6,18 * * *' # Every 12 hours (2x daily), offset from T4 smoke test + - cron: '15 6,18 * * *' # Every 12 hours (2x daily), offset from 
training test push: branches: - "pull-request/[0-9]+" @@ -56,8 +56,14 @@ jobs: - '.github/actions/install-karpenter-kwok/**' - 'validators/*/Dockerfile' - 'pkg/evidence/**' + - '.github/workflows/gpu-h100-kind-runtime-test.yaml' + - '.github/actions/gpu-debug-diagnostics/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' + - '.github/scripts/gpu-chainsaw-health.sh' + - '.github/scripts/gpu-debug-diagnostics.sh' + - '.github/scripts/gpu-runtime-component-health.sh' + - '.github/scripts/gpu-validate-conformance.sh' - 'pkg/bundler/deployer/helm/**' - 'tests/chainsaw/chainsaw-config.yaml' - 'tests/chainsaw/ai-conformance/main.go' @@ -84,11 +90,11 @@ jobs: - 'pkg/defaults/timeouts.go' - 'validators/conformance/**' + # NVIDIA self-hosted GPU runners reject pull_request event jobs before + # checkout. PR GPU coverage runs through the pull-request/ push + # mirror after ok-to-test approval. gpu-inference-test: needs: [check-paths] - # NVIDIA self-hosted GPU runners reject pull_request event jobs before - # checkout. PR GPU coverage runs through the pull-request/ push - # mirror after ok-to-test approval. if: > always() && ( github.event_name == 'schedule' || @@ -96,184 +102,11 @@ jobs: (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) name: GPU Inference Test (nvkind + H100 x2) - concurrency: - group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} - cancel-in-progress: true - runs-on: linux-amd64-gpu-h100-latest-2 - # Cold self-hosted H100 runners can spend most of the old budget pulling - # images and loading Kind nodes before validation starts. 
- timeout-minutes: 180 - - env: - KIND_CLUSTER_NAME: gpu-inference-test - - steps: - - - name: Checkout Code - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Load GPU test versions - id: versions - uses: ./.github/actions/load-versions - - - name: Set up GPU cluster - uses: ./.github/actions/gpu-cluster-setup - with: - kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }} - min_gpu_count: '2' - gpu_model_pattern: H100 - min_free_disk_gb: '50' - min_available_memory_gb: '16' - cluster_create_timeout: 900s - control_plane_resource_patches: 'true' - control_plane_leader_election_tuning: 'true' - - - name: Build aicr and snapshot agent image - uses: ./.github/actions/aicr-build - with: - build_snapshot_agent: 'true' - validator_phases: 'none' - - # Fast readiness gate after cluster setup. Stability windows start after - # runtime install, where component rollouts can stress the control plane. - - name: Check control plane health - uses: ./.github/actions/check-control-plane-health - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - wait_timeout: 120s - stability_window: 0s - recover_unhealthy: 'true' - - - name: Install runtime bundle - id: bundle-install - uses: ./.github/actions/gpu-operator-install - with: - method: bundle - accelerator: h100 - platform: dynamo - wait: 'true' - best_effort: 'false' - - # Runtime install creates many CRDs, webhooks, and controllers. Keep a - # stability window here to catch KCM/scheduler restarts before snapshot. 
- - name: Check control plane health - uses: ./.github/actions/check-control-plane-health - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - wait_timeout: 120s - stability_window: 60s - recover_unhealthy: 'true' - - # --- Snapshot and GPU validation --- - - - name: Snapshot and validate GPU - uses: ./.github/actions/gpu-snapshot-validate - with: - gpu_model: H100 - min_gpu_count: '2' - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - snapshot_timeout: 10m - - # --- Install Karpenter + KWOK early to give monitoring stack settle time --- - - # Snapshot deploys a GPU Job and exercises cluster discovery; verify the - # control plane stayed stable before adding Karpenter/KWOK. - - name: Check control plane health - uses: ./.github/actions/check-control-plane-health - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - wait_timeout: 120s - stability_window: 60s - recover_unhealthy: 'true' - - - name: Install Karpenter + KWOK - uses: ./.github/actions/install-karpenter-kwok - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - kwok_helm_timeout: 600s - ko_build_timeout: '1200' - karpenter_helm_timeout: 600s - - # --- Health checks --- - - - name: Install chainsaw - uses: ./.github/actions/setup-build-tools - with: - install_chainsaw: 'true' - chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' - - # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above - # only installs a runner-side binary. - - name: Check control plane health - uses: ./.github/actions/check-control-plane-health - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - wait_timeout: 120s - stability_window: 60s - recover_unhealthy: 'true' - - - name: Run chainsaw health checks - # The H100 stack can make namespace cleanup API calls slow under load. - # Keep cleanup enabled, but allow more than the default 30s deadline. 
- run: bash .github/scripts/gpu-chainsaw-health.sh tests/chainsaw/ai-conformance/kind-inference-dynamo - - # --- CNCF AI Conformance validation --- - # Runs after the stack health checks so gateway and metrics validators - # see a settled inference stack. - - - name: Build conformance validator image - uses: ./.github/actions/aicr-build - with: - build_cli: 'false' - build_snapshot_agent: 'false' - validator_phases: 'conformance' - - # Validator image build/load can contend with Docker and kind containerd; - # verify the control plane before the final conformance workload. - - name: Check control plane health - uses: ./.github/actions/check-control-plane-health - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - wait_timeout: 120s - stability_window: 60s - recover_unhealthy: 'true' - - - name: Validate CNCF AI Conformance - id: validate-conformance - run: bash .github/scripts/gpu-validate-conformance.sh - - # Dynamo smoke is intentionally disabled for now. The vLLM runtime image - # adds significant latency and flakiness in Kind CI, and training has no - # matching smoke path yet. Reintroduce it later alongside a symmetric - # training smoke test if needed. 
- # --- Validation artifacts --- - - - name: Upload validation artifacts - if: always() - timeout-minutes: 5 - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: conformance-evidence - path: | - conformance-evidence/ - validation-result.yaml - if-no-files-found: warn - - # --- Debug diagnostics (before cleanup so resources still exist) --- - - - name: Debug diagnostics - if: failure() - timeout-minutes: 5 - shell: bash - env: - GPU_TEST_DIAGNOSTIC_MODE: inference - run: bash .github/scripts/gpu-debug-diagnostics.sh - - - name: GPU Test Cleanup - if: always() - uses: ./.github/actions/gpu-test-cleanup - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - artifact_name_prefix: gpu-inference-test-debug + uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml + with: + job_name: GPU Inference Test (nvkind + H100 x2) + cluster_name: gpu-inference-test + intent: inference + platform: dynamo + chainsaw_path: tests/chainsaw/ai-conformance/kind-inference-dynamo + artifact_name_prefix: gpu-inference-test-debug diff --git a/.github/workflows/gpu-h100-kind-runtime-test.yaml b/.github/workflows/gpu-h100-kind-runtime-test.yaml new file mode 100644 index 000000000..6d0f8757b --- /dev/null +++ b/.github/workflows/gpu-h100-kind-runtime-test.yaml @@ -0,0 +1,221 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: GPU H100 Kind Runtime Test + +on: + workflow_call: + inputs: + job_name: + description: 'Display name for the H100 runtime job' + required: true + type: string + cluster_name: + description: 'Kind cluster name' + required: true + type: string + intent: + description: 'Runtime intent passed to the bundle installer' + required: true + type: string + platform: + description: 'Runtime platform passed to the bundle installer' + required: true + type: string + chainsaw_path: + description: 'Chainsaw health-check directory' + required: true + type: string + artifact_name_prefix: + description: 'Prefix for uploaded debug artifacts' + required: true + type: string + +permissions: + contents: read + +jobs: + gpu-h100-kind-runtime-test: + name: ${{ inputs.job_name }} + runs-on: linux-amd64-gpu-h100-latest-2 + # Cold self-hosted H100 runners can spend most of this budget pulling + # images and loading Kind nodes before validation starts. + timeout-minutes: 180 + concurrency: + group: gpu-h100-${{ inputs.cluster_name }}-${{ github.event_name }}-${{ github.ref }} + cancel-in-progress: true + + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + + steps: + - name: Checkout Code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Load GPU test versions + id: versions + uses: ./.github/actions/load-versions + + - name: Set up GPU cluster + uses: ./.github/actions/gpu-cluster-setup + with: + kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }} + min_gpu_count: '2' + gpu_model_pattern: H100 + min_free_disk_gb: '50' + min_available_memory_gb: '16' + cluster_create_timeout: 900s + control_plane_resource_patches: 'true' + control_plane_leader_election_tuning: 'true' + + - name: Build aicr and snapshot agent image + uses: ./.github/actions/aicr-build + with: + build_snapshot_agent: 'true' + validator_phases: 'none' + + # Fast readiness gate after cluster setup. 
Stability windows start after + # runtime install, where component rollouts can stress the control plane. + - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + stability_window: 0s + recover_unhealthy: 'true' + + - name: Install runtime bundle + id: bundle-install + uses: ./.github/actions/gpu-operator-install + with: + method: bundle + accelerator: h100 + intent: ${{ inputs.intent }} + platform: ${{ inputs.platform }} + wait: 'true' + best_effort: 'false' + + - name: Check runtime component health + run: bash .github/scripts/gpu-runtime-component-health.sh "${{ inputs.intent }}" + + # Runtime install creates many CRDs, webhooks, and controllers. Keep a + # stability window here to catch KCM/scheduler restarts before snapshot. + - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' + + - name: Snapshot and validate GPU + uses: ./.github/actions/gpu-snapshot-validate + with: + gpu_model: H100 + min_gpu_count: '2' + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + snapshot_timeout: 10m + + # Snapshot deploys a GPU Job and exercises cluster discovery; verify the + # control plane stayed stable before adding Karpenter/KWOK. 
+ - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' + + - name: Install Karpenter + KWOK + uses: ./.github/actions/install-karpenter-kwok + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + kwok_helm_timeout: 600s + ko_build_timeout: '1200' + karpenter_helm_timeout: 600s + + - name: Install chainsaw + uses: ./.github/actions/setup-build-tools + with: + install_chainsaw: 'true' + chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' + + # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above + # only installs a runner-side binary. + - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' + + - name: Run chainsaw health checks + run: bash .github/scripts/gpu-chainsaw-health.sh "${{ inputs.chainsaw_path }}" + + - name: Build conformance validator image + uses: ./.github/actions/aicr-build + with: + build_cli: 'false' + build_snapshot_agent: 'false' + validator_phases: 'conformance' + + # Validator image build/load can contend with Docker and kind containerd; + # verify the control plane before the final conformance workload. 
+ - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' + + - name: Validate CNCF AI Conformance + id: validate-conformance + run: bash .github/scripts/gpu-validate-conformance.sh + + - name: Upload validation artifacts + if: always() + timeout-minutes: 5 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: conformance-evidence + path: | + conformance-evidence/ + validation-result.yaml + if-no-files-found: warn + + - name: Debug diagnostics + if: failure() + timeout-minutes: 5 + uses: ./.github/actions/gpu-debug-diagnostics + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + mode: ${{ inputs.intent }} + + - name: Mark debug artifact collection + id: gpu-debug-artifacts + if: failure() || cancelled() + shell: bash + run: echo "collect=true" >> "${GITHUB_OUTPUT}" + + - name: GPU Test Cleanup + if: always() + uses: ./.github/actions/gpu-test-cleanup + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + artifact_name_prefix: ${{ inputs.artifact_name_prefix }} + collect_artifacts: ${{ steps.gpu-debug-artifacts.outputs.collect == 'true' }} diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index f5ecbaf7f..51fbed8ba 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -56,8 +56,14 @@ jobs: - '.github/actions/install-karpenter-kwok/**' - 'validators/*/Dockerfile' - 'pkg/evidence/**' + - '.github/workflows/gpu-h100-kind-runtime-test.yaml' + - '.github/actions/gpu-debug-diagnostics/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' + - '.github/scripts/gpu-chainsaw-health.sh' + - '.github/scripts/gpu-debug-diagnostics.sh' + - '.github/scripts/gpu-runtime-component-health.sh' + - '.github/scripts/gpu-validate-conformance.sh' 
- 'pkg/bundler/deployer/helm/**' - 'tests/chainsaw/chainsaw-config.yaml' - 'tests/chainsaw/ai-conformance/main.go' @@ -80,11 +86,11 @@ jobs: - 'pkg/defaults/timeouts.go' - 'validators/conformance/**' + # NVIDIA self-hosted GPU runners reject pull_request event jobs before + # checkout. PR GPU coverage runs through the pull-request/<N> push + # mirror after ok-to-test approval. gpu-training-test: needs: [check-paths] - # NVIDIA self-hosted GPU runners reject pull_request event jobs before - # checkout. PR GPU coverage runs through the pull-request/<N> push - # mirror after ok-to-test approval. if: > always() && ( github.event_name == 'schedule' || @@ -92,180 +98,11 @@ jobs: (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) name: GPU Training Test (nvkind + H100 x2) - concurrency: - group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} - cancel-in-progress: true - runs-on: linux-amd64-gpu-h100-latest-2 - # Cold self-hosted H100 runners can spend most of the old budget pulling - # images and loading Kind nodes before validation starts. - timeout-minutes: 180 - - env: - KIND_CLUSTER_NAME: gpu-training-test - - steps: - - - name: Checkout Code - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Load GPU test versions - id: versions - uses: ./.github/actions/load-versions - - - name: Set up GPU cluster - uses: ./.github/actions/gpu-cluster-setup - with: - kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }} - min_gpu_count: '2' - gpu_model_pattern: H100 - min_free_disk_gb: '50' - min_available_memory_gb: '16' - cluster_create_timeout: 900s - control_plane_resource_patches: 'true' - control_plane_leader_election_tuning: 'true' - - - name: Build aicr and snapshot agent image - uses: ./.github/actions/aicr-build - with: - build_snapshot_agent: 'true' - validator_phases: 'none' - - # Fast readiness gate after cluster setup.
Stability windows start after - # runtime install, where component rollouts can stress the control plane. - - name: Check control plane health - uses: ./.github/actions/check-control-plane-health - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - wait_timeout: 120s - stability_window: 0s - recover_unhealthy: 'true' - - - name: Install runtime bundle - id: bundle-install - uses: ./.github/actions/gpu-operator-install - with: - method: bundle - accelerator: h100 - intent: training - platform: kubeflow - wait: 'true' - best_effort: 'false' - - # Runtime install creates many CRDs, webhooks, and controllers. Keep a - # stability window here to catch KCM/scheduler restarts before snapshot. - - name: Check control plane health - id: post_runtime_control_plane_health - uses: ./.github/actions/check-control-plane-health - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - wait_timeout: 120s - stability_window: 60s - recover_unhealthy: 'true' - - - name: Snapshot and validate GPU - uses: ./.github/actions/gpu-snapshot-validate - with: - gpu_model: H100 - min_gpu_count: '2' - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - snapshot_timeout: 10m - - # --- Install Karpenter + KWOK early to give monitoring stack settle time --- - - # Snapshot deploys a GPU Job and exercises cluster discovery; verify the - # control plane stayed stable before adding Karpenter/KWOK. 
- - name: Check control plane health - uses: ./.github/actions/check-control-plane-health - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - wait_timeout: 120s - stability_window: 60s - recover_unhealthy: 'true' - - - name: Install Karpenter + KWOK - uses: ./.github/actions/install-karpenter-kwok - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - kwok_helm_timeout: 600s - ko_build_timeout: '1200' - karpenter_helm_timeout: 600s - - # --- Health checks --- - - - name: Install chainsaw - uses: ./.github/actions/setup-build-tools - with: - install_chainsaw: 'true' - chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' - - # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above - # only installs a runner-side binary. - - name: Check control plane health - uses: ./.github/actions/check-control-plane-health - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - wait_timeout: 120s - stability_window: 60s - recover_unhealthy: 'true' - - - name: Run chainsaw health checks - # The H100 stack can make namespace cleanup API calls slow under load. - # Keep cleanup enabled, but allow more than the default 30s deadline. - run: bash .github/scripts/gpu-chainsaw-health.sh tests/chainsaw/ai-conformance/kind-training-kubeflow - - # --- CNCF AI Conformance validation --- - # Runs last to ensure the DCGM → Prometheus → adapter pipeline - # has had time to bootstrap (pod-autoscaling check needs live metric data). - - - name: Build conformance validator image - uses: ./.github/actions/aicr-build - with: - build_cli: 'false' - build_snapshot_agent: 'false' - validator_phases: 'conformance' - - # Validator image build/load can contend with Docker and kind containerd; - # verify the control plane before the final conformance workload. 
- - name: Check control plane health - uses: ./.github/actions/check-control-plane-health - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - wait_timeout: 120s - stability_window: 60s - recover_unhealthy: 'true' - - - name: Validate CNCF AI Conformance - id: validate-conformance - run: bash .github/scripts/gpu-validate-conformance.sh - - # --- Validation artifacts --- - - - name: Upload validation artifacts - if: always() - timeout-minutes: 5 - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: conformance-evidence - path: | - conformance-evidence/ - validation-result.yaml - if-no-files-found: warn - - # --- Debug diagnostics (before cleanup so resources still exist) --- - - - name: Debug diagnostics - if: failure() - timeout-minutes: 5 - shell: bash - env: - GPU_TEST_DIAGNOSTIC_MODE: training - run: bash .github/scripts/gpu-debug-diagnostics.sh - - - name: GPU Test Cleanup - if: always() - uses: ./.github/actions/gpu-test-cleanup - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - artifact_name_prefix: gpu-training-test-debug + uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml + with: + job_name: GPU Training Test (nvkind + H100 x2) + cluster_name: gpu-training-test + intent: training + platform: kubeflow + chainsaw_path: tests/chainsaw/ai-conformance/kind-training-kubeflow + artifact_name_prefix: gpu-training-test-debug diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml index bdf607e07..af8d3860c 100644 --- a/.github/workflows/gpu-smoke-test.yaml +++ b/.github/workflows/gpu-smoke-test.yaml @@ -50,8 +50,13 @@ jobs: - '.github/actions/gpu-cluster-setup/**' - '.github/actions/gpu-operator-install/**' - '.github/actions/aicr-build/**' + - '.github/actions/gpu-debug-diagnostics/**' - '.github/actions/gpu-test-cleanup/**' + - '.github/actions/gpu-smoke-nvidia-smi/**' - '.github/actions/load-versions/**' + - '.github/scripts/gpu-debug-diagnostics.sh' + - 
'.github/scripts/gpu-smoke-run-nvidia-smi.sh' + - '.github/scripts/gpu-smoke-show-nvidia-smi-log.sh' - 'pkg/collector/**' - 'pkg/snapshotter/**' - '.github/actions/gpu-snapshot-validate/**' @@ -108,10 +113,9 @@ jobs: method: helm - name: Run nvidia-smi in a pod - run: bash .github/scripts/gpu-smoke-run-nvidia-smi.sh - - - name: Show nvidia-smi output - run: bash .github/scripts/gpu-smoke-show-nvidia-smi-log.sh + uses: ./.github/actions/gpu-smoke-nvidia-smi + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} # --- Snapshot and validation --- @@ -124,12 +128,20 @@ jobs: - name: Debug diagnostics if: failure() - env: - GPU_TEST_DIAGNOSTIC_MODE: smoke - run: bash .github/scripts/gpu-debug-diagnostics.sh + uses: ./.github/actions/gpu-debug-diagnostics + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + mode: smoke + + - name: Mark debug artifact collection + id: gpu-debug-artifacts + if: failure() || cancelled() + shell: bash + run: echo "collect=true" >> "${GITHUB_OUTPUT}" - name: GPU Test Cleanup if: always() uses: ./.github/actions/gpu-test-cleanup with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} + collect_artifacts: ${{ steps.gpu-debug-artifacts.outputs.collect == 'true' }} diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index da23545a0..d266afb96 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -588,6 +588,10 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { `requires Helm v4.0.5 or later`, `--wait --timeout 20m`, }, + rejectSnippets: []string{ + `local prerelease`, + `if [[ -n "${prerelease}" ]]`, + }, }, { name: "kube-prometheus-stack", diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 4c1e26f28..11ffb8475 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -72,8 +72,7 @@ function 
helm_supports_server_side_false_install() { version="$(helm version --short 2>/dev/null | head -n 1 || true)" version="${version#v}" version="${version%%+*}" - version="${version%%-*}" - if ! [[ "${version}" =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then + if ! [[ "${version}" =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)(-[0-9A-Za-z.-]+)?$ ]]; then return 1 fi major="${BASH_REMATCH[1]}" diff --git a/recipes/overlays/kind.yaml b/recipes/overlays/kind.yaml index b0d8dbd76..791016135 100644 --- a/recipes/overlays/kind.yaml +++ b/recipes/overlays/kind.yaml @@ -115,6 +115,11 @@ spec: - name: kube-prometheus-stack type: Helm overrides: + # CI only needs component health, not the full upstream alerting rule + # set. Skipping default rules reduces PrometheusRule churn during + # install on small kind control planes. + defaultRules: + create: false prometheus: prometheusSpec: # Smaller storage for local testing @@ -132,7 +137,35 @@ spec: memory: 1Gi # Shorter retention for local testing retention: 7d + prometheusOperator: + # Keep operator-owned monitoring custom resources in the monitoring + # namespace for kind. Do not scope ServiceMonitor discovery here; + # GPU, Kubeflow, and Dynamo monitors may live in their own namespaces. + alertmanagerInstanceNamespaces: + - monitoring + alertmanagerConfigNamespaces: + - monitoring + prometheusInstanceNamespaces: + - monitoring + thanosRulerInstanceNamespaces: + - monitoring + # CI kind control planes can be slow under image pulls and controller + # startup. Avoid restarting the operator on short health probe stalls. 
+ livenessProbe: + timeoutSeconds: 10 + failureThreshold: 10 + readinessProbe: + timeoutSeconds: 10 + failureThreshold: 6 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi grafana: + enabled: false resources: requests: cpu: 100m From 727f1b0cfb515e58bf3820587bfc803760bb4047 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Sun, 26 Apr 2026 19:59:25 -0700 Subject: [PATCH 21/21] ci: address follow-up GPU review feedback --- .../configure-nvidia-container-toolkit.sh | 2 +- .../collect-debug-artifacts.sh | 7 ++++ .../scripts/gpu-runtime-component-health.sh | 34 ++++++++++++++++--- .../scripts/gpu-smoke-show-nvidia-smi-log.sh | 5 ++- pkg/bundler/deployer/helm/helm_test.go | 10 ++++-- .../deployer/helm/templates/deploy.sh.tmpl | 28 ++++++++------- 6 files changed, 66 insertions(+), 20 deletions(-) diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh index 16352077c..84635a988 100644 --- a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh +++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh @@ -25,7 +25,7 @@ set -e if (( restart_status != 0 )); then echo "::error::Docker restart failed after NVIDIA runtime configuration" sudo systemctl status docker --no-pager || true - journalctl -u docker --since "10 minutes ago" --no-pager || true + sudo journalctl -u docker --since "10 minutes ago" --no-pager || true exit "${restart_status}" fi diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh index cb33e770a..7c780e3f7 100644 --- a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh +++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh @@ -19,6 +19,7 @@ set -uo pipefail rm -rf /tmp/debug-artifacts mkdir -p /tmp/debug-artifacts CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager 
kube-scheduler etcd" +MAX_KIND_NODE_ARTIFACT_SECONDS="${MAX_KIND_NODE_ARTIFACT_SECONDS:-600}" kubectl_kind() { timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" } @@ -95,9 +96,15 @@ else echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive" fi +artifact_loop_start="$(date +%s)" docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ --format '{{.Names}}' | sort | while read -r node_container; do [[ -z "${node_container}" ]] && continue + artifact_loop_elapsed=$(($(date +%s) - artifact_loop_start)) + if (( artifact_loop_elapsed > MAX_KIND_NODE_ARTIFACT_SECONDS )); then + echo "Kind node artifact collection exceeded ${MAX_KIND_NODE_ARTIFACT_SECONDS}s; stopping after partial collection." + break + fi node_file="${node_container//[^A-Za-z0-9_.-]/_}" docker_timeout 30s inspect "${node_container}" \ > "/tmp/debug-artifacts/${node_file}-docker-inspect.json" 2>&1 || true diff --git a/.github/scripts/gpu-runtime-component-health.sh b/.github/scripts/gpu-runtime-component-health.sh index 93b8efc7b..3d668d37b 100644 --- a/.github/scripts/gpu-runtime-component-health.sh +++ b/.github/scripts/gpu-runtime-component-health.sh @@ -23,6 +23,22 @@ fi mode="$1" COMPONENT_HEALTH_TIMEOUT="${COMPONENT_HEALTH_TIMEOUT:-120s}" +duration_seconds() { + local input_value="$1" + local number="${input_value%[smh]}" + local unit="${input_value: -1}" + + case "${unit}" in + s) echo "$((10#${number}))" ;; + m) echo "$((10#${number} * 60))" ;; + h) echo "$((10#${number} * 3600))" ;; + *) + echo "::error::unsupported duration '${input_value}'" >&2 + exit 1 + ;; + esac +} + kubectl_kind() { timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" } @@ -62,13 +78,23 @@ wait_for_deployments() { wait_for_required_object() { local resource="$1" + local timeout_seconds + local deadline - echo "Verifying ${resource}" - if kubectl_kind get "${resource}" >/dev/null; then - return 0 - fi + 
timeout_seconds="$(duration_seconds "${COMPONENT_HEALTH_TIMEOUT}")" + deadline=$((SECONDS + timeout_seconds)) + + echo "Waiting up to ${COMPONENT_HEALTH_TIMEOUT} for ${resource}" + while (( SECONDS <= deadline )); do + if kubectl_kind get "${resource}" >/dev/null; then + return 0 + fi + sleep 2 + done echo "::error::Required object is missing: ${resource}" + kubectl_kind get "${resource}" -o yaml 2>/dev/null || true + kubectl_kind describe "${resource}" 2>/dev/null || true return 1 } diff --git a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh index 25b33c862..05bc09523 100644 --- a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh +++ b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh @@ -17,6 +17,7 @@ set -euo pipefail KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}" POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}" +trap 'rm -f "${POD_NAME_FILE}"' EXIT kubectl_kind() { kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@" @@ -25,6 +26,9 @@ kubectl_kind() { pod_name="" if [[ -f "${POD_NAME_FILE}" ]]; then pod_name="$(cat "${POD_NAME_FILE}")" + if [[ -n "${pod_name}" ]] && ! 
kubectl_kind get pod "${pod_name}" >/dev/null 2>&1; then + pod_name="" + fi fi if [[ -z "${pod_name}" ]]; then @@ -40,4 +44,3 @@ if [[ -z "${pod_name}" ]]; then fi kubectl_kind logs "${pod_name}" -rm -f "${POD_NAME_FILE}" diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index d266afb96..33dc2c6e9 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -557,6 +557,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { wantSnippets []string wantReadmeSnippets []string rejectSnippets []string + rejectScriptSnippets []string rejectReadmeSnippets []string rejectRetryCap bool }{ @@ -579,7 +580,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { `Require v4.0.5+ before relying on`, `--request-timeout="${KUBECTL_REQUEST_TIMEOUT}"`, `dynamo-platform conflict mitigation requires Helm v4.0.5+`, - `dump_dynamo_platform_helm_diagnostics "${namespace}"`, + `dump_dynamo_platform_helm_diagnostics "${component}" "${namespace}"`, `deployment/dynamo-platform-dynamo-operator-controller-manager`, `--previous --tail=200`, }, @@ -588,7 +589,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { `requires Helm v4.0.5 or later`, `--wait --timeout 20m`, }, - rejectSnippets: []string{ + rejectScriptSnippets: []string{ `local prerelease`, `if [[ -n "${prerelease}" ]]`, }, @@ -703,6 +704,11 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { t.Errorf("deploy.sh should not include %s snippet %q", tt.component.Name, snippet) } } + for _, snippet := range tt.rejectScriptSnippets { + if strings.Contains(script, snippet) { + t.Errorf("deploy.sh should not include %s script snippet %q", tt.component.Name, snippet) + } + } if tt.rejectRetryCap && retryCapPattern.MatchString(componentBlock) { t.Errorf("deploy.sh should not cap %s retries in its component block", tt.component.Name) } diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl 
b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 11ffb8475..6b946a969 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -174,8 +174,9 @@ function dump_kai_scheduler_helm_diagnostics() { } function dump_dynamo_platform_helm_diagnostics() { - local namespace="$1" - if [[ "${namespace}" != "dynamo-system" ]]; then + local component="$1" + local namespace="$2" + if [[ "${component}" != "dynamo-platform" ]]; then return fi @@ -202,15 +203,16 @@ function dump_dynamo_platform_helm_diagnostics() { } # helm_retry contract: -# helm_retry "<desc>" "<namespace>" "<max_retries>" <command> [args...] -# Callers must pass the retry budget as the third positional argument before the -# command to execute. This keeps per-component retry tuning explicit at the -# callsite instead of relying on the global MAX_RETRIES fallback. +# helm_retry "<desc>" "<component>" "<namespace>" "<max_retries>" <command> [args...] +# Callers must pass the component name and retry budget before the +# command to execute. This keeps per-component retry tuning and diagnostics explicit at the +# callsite instead of relying on global fallbacks.
function helm_retry() { local desc="$1" - local namespace="$2" - local max_retries="$3" - shift 3 + local component="$2" + local namespace="$3" + local max_retries="$4" + shift 4 local attempt=0 while true; do if "$@"; then @@ -218,7 +220,7 @@ function helm_retry() { fi attempt=$((attempt + 1)) dump_kai_scheduler_helm_diagnostics "${namespace}" - dump_dynamo_platform_helm_diagnostics "${namespace}" + dump_dynamo_platform_helm_diagnostics "${component}" "${namespace}" if [[ ${attempt} -gt ${max_retries} ]]; then echo "ERROR: ${desc} failed after ${attempt} attempts" return 1 @@ -471,7 +473,8 @@ if echo "${ASYNC_COMPONENTS}" | grep -qw "{{ .Name }}"; then echo " (async component — skipping --wait, keeping --timeout for hooks)" fi {{ if .IsOCI -}} -helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ +helm_retry "{{ .Name }} helm install" "{{ .Name }}" \ + "{{ .Namespace }}" \ "${COMPONENT_MAX_RETRIES}" \ helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \ @@ -483,7 +486,8 @@ helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ ${COMPONENT_WAIT_ARGS} \ || helm_failed "{{ .Name }}" {{ else -}} -helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ +helm_retry "{{ .Name }} helm install" "{{ .Name }}" \ + "{{ .Namespace }}" \ "${COMPONENT_MAX_RETRIES}" \ helm upgrade --install {{ .Name }} {{ .ChartName }} \ ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \