From 40abdad034b18790788009994eeda6b0f27da98b Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 08:25:10 -0700
Subject: [PATCH 01/21] ci: harden H100 GPU qualification

---
 .../check-control-plane-health/action.yml     | 116 ++++++++++++++++++
 .../actions/gpu-operator-install/action.yml   |  39 +++---
 .../actions/gpu-snapshot-validate/action.yml  |   5 +
 .github/actions/gpu-test-cleanup/action.yml   |   9 ++
 .../actions/install-karpenter-kwok/action.yml |  15 +++
 .../workflows/gpu-h100-inference-test.yaml    |  46 ++++++-
 .github/workflows/gpu-h100-training-test.yaml |  46 ++++++-
 kwok/scripts/install-karpenter-kwok.sh        |  14 ++-
 pkg/bundler/deployer/helm/helm_test.go        | 109 +++++++++++++++-
 .../deployer/helm/templates/deploy.sh.tmpl    |  18 ++-
 .../kind-training-kubeflow/chainsaw-test.yaml |   2 +
 11 files changed, 390 insertions(+), 29 deletions(-)
 create mode 100644 .github/actions/check-control-plane-health/action.yml

diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
new file mode 100644
index 000000000..ae2a87db7
--- /dev/null
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -0,0 +1,116 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'Check Control Plane Health'
+description: 'Fails if Kind control-plane static pods are missing, unready, or restarted more than max_restarts times.'
+
+inputs:
+  cluster_name:
+    description: 'Kind cluster name'
+    required: true
+  namespace:
+    description: 'Namespace that contains the control-plane pods'
+    required: false
+    default: kube-system
+  components:
+    description: 'Space-separated component label values to check'
+    required: false
+    default: kube-apiserver kube-controller-manager kube-scheduler etcd
+  wait_timeout:
+    description: 'Timeout for each component readiness wait'
+    required: false
+    default: 60s
+  max_restarts:
+    description: 'Maximum tolerated restart count for each control-plane container'
+    required: false
+    default: '1'
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Check control-plane pods
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+        NAMESPACE: ${{ inputs.namespace }}
+        COMPONENTS: ${{ inputs.components }}
+        WAIT_TIMEOUT: ${{ inputs.wait_timeout }}
+        MAX_RESTARTS: ${{ inputs.max_restarts }}
+      run: |
+        set -euo pipefail
+
+        MAX_RESTARTS="${MAX_RESTARTS#"${MAX_RESTARTS%%[![:space:]]*}"}"  # strip leading whitespace
+        MAX_RESTARTS="${MAX_RESTARTS%"${MAX_RESTARTS##*[![:space:]]}"}"  # strip trailing whitespace
+        if ! [[ "${MAX_RESTARTS}" =~ ^[0-9]+$ ]]; then  # reject anything but decimal digits
+          echo "::error::max_restarts must be a non-negative integer, got '${MAX_RESTARTS}'"
+          exit 1
+        fi
+
+        kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz' || true  # informational only; a failure is tolerated here
+
+        check_component() {  # $1 = component label value; exits 1 on the first failed check
+          local component="$1"
+          local selector="component=${component}"
+          local pods
+
+          pods=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+            get pod -l "${selector}" -o name)
+          if [[ -z "${pods}" ]]; then  # no pod matched the label selector
+            echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}"
+            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+              get pods -o wide || true
+            exit 1
+          fi
+
+          if ! kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+            wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then
+            echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}"
+            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+              get pod -l "${selector}" -o wide || true
+            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+              describe pod -l "${selector}" || true
+            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+              get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+            kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz' || true
+            exit 1
+          fi
+
+          local restart_counts
+          restart_counts=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+            get pod -l "${selector}" \
+            -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}')
+          if [[ -z "${restart_counts}" ]]; then  # pods matched but reported no containerStatuses
+            echo "::error::no container statuses found for ${component} pods"
+            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+              describe pod -l "${selector}" || true
+            exit 1
+          fi
+
+          while IFS= read -r restart_count; do
+            [[ -z "${restart_count}" ]] && continue  # skip blank lines in the jsonpath output
+            if (( restart_count > MAX_RESTARTS )); then  # restarts up to MAX_RESTARTS are tolerated
+              echo "::error::${component} restartCount=${restart_count} exceeds max_restarts=${MAX_RESTARTS}"
+              kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+                get pod -l "${selector}" -o wide || true
+              kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+                describe pod -l "${selector}" || true
+              exit 1
+            fi
+          done <<< "${restart_counts}"
+        }
+
+        for component in ${COMPONENTS}; do  # unquoted on purpose: word-split the space-separated list
+          check_component "${component}"
+        done
+        kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz'  # final gate: no '|| true', so a failing /readyz fails the step
diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml
index e2bdb300c..86d247932 100644
--- a/.github/actions/gpu-operator-install/action.yml
+++ b/.github/actions/gpu-operator-install/action.yml
@@ -31,6 +31,14 @@ inputs:
     description: 'Platform for recipe generation (bundle mode only, e.g. dynamo)'
     required: false
     default: ''
+  wait:
+    description: 'Wait for bundle Helm resources during deploy'
+    required: false
+    default: 'false'
+  best_effort:
+    description: 'Continue deploying remaining bundle components after a component failure'
+    required: false
+    default: 'true'
 
 runs:
   using: 'composite'
@@ -82,8 +90,7 @@ runs:
           --intent ${{ inputs.intent }} \
           ${PLATFORM_FLAG} \
           --output recipe.yaml
-        echo "--- Recipe ---"
-        cat recipe.yaml
+        echo "Recipe written to recipe.yaml"
 
     - name: Generate deployment bundle
       if: inputs.method == 'bundle'
@@ -101,19 +108,23 @@ runs:
       shell: bash
       run: |
         cd bundle
-        # Use --no-wait: several components (gpu-operator ClusterPolicy,
-        # kai-scheduler SchedulingShard, nvidia-dra-driver-gpu kubelet plugin)
-        # stay InProgress in kind because their CRs/DaemonSets require
-        # features not available in kind (DRA feature gates, driver modules).
-        # The explicit "Wait for GPU operands" step below gates on what
-        # actually matters (device plugin readiness).
-        # --best-effort: some components (e.g. network-operator) have Helm
-        # hooks that may time out in Kind; continue deploying remaining
-        # components so the overall stack is functional.
+        # The default keeps legacy bundle-mode behavior: do not wait on every
+        # Helm resource and keep deploying after component failures. H100
+        # qualification jobs override these inputs to hard-fail and wait.
         chmod +x deploy.sh
-        echo "--- deploy.sh ---"
-        cat deploy.sh
-        ./deploy.sh --no-wait --best-effort
+        DEPLOY_ARGS=()
+        if [[ "${{ inputs.wait }}" != "true" ]]; then
+          DEPLOY_ARGS+=(--no-wait)
+        fi
+        if [[ "${{ inputs.best_effort }}" == "true" ]]; then
+          DEPLOY_ARGS+=(--best-effort)
+        fi
+        if [[ "${#DEPLOY_ARGS[@]}" -gt 0 ]]; then
+          echo "Deploying bundle with args: ${DEPLOY_ARGS[*]}"
+        else
+          echo "Deploying bundle with default args"
+        fi
+        ./deploy.sh "${DEPLOY_ARGS[@]}"
 
     - name: Wait for GPU operands (bundle)
       if: inputs.method == 'bundle'
diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml
index e1ee3c14b..9cfb67e9c 100644
--- a/.github/actions/gpu-snapshot-validate/action.yml
+++ b/.github/actions/gpu-snapshot-validate/action.yml
@@ -26,6 +26,10 @@ inputs:
   cluster_name:
     description: 'Kind cluster name (for kubectl context)'
     required: true
+  snapshot_timeout:
+    description: 'Timeout for aicr snapshot'
+    required: false
+    default: '5m'
 
 runs:
   using: composite
@@ -38,6 +42,7 @@ runs:
           --namespace=default \
           --image=ko.local:smoke-test \
           --require-gpu \
+          --timeout="${{ inputs.snapshot_timeout }}" \
           --output=snapshot.yaml
         echo "--- Snapshot output ---"
         cat snapshot.yaml
diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml
index 30ac7831f..d2089816b 100644
--- a/.github/actions/gpu-test-cleanup/action.yml
+++ b/.github/actions/gpu-test-cleanup/action.yml
@@ -40,6 +40,15 @@ runs:
         kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
         kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
         kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true
+        tar_inputs=()
+        [[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml)
+        [[ -d bundle ]] && tar_inputs+=(bundle)
+        if [[ "${#tar_inputs[@]}" -gt 0 ]]; then
+          echo "Archiving runtime bundle inputs: ${tar_inputs[*]}"
+          tar -czf /tmp/debug-artifacts/aicr-runtime-bundle.tar.gz "${tar_inputs[@]}" || true
+        else
+          echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive"
+        fi
 
     - name: Export kind logs
       if: failure()
diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml
index fde7bddde..dab642174 100644
--- a/.github/actions/install-karpenter-kwok/action.yml
+++ b/.github/actions/install-karpenter-kwok/action.yml
@@ -19,6 +19,18 @@ inputs:
   cluster_name:
     description: 'Kind cluster name (used for kubectl context)'
     required: true
+  kwok_helm_timeout:
+    description: 'Timeout for KWOK controller Helm install'
+    required: false
+    default: '300s'
+  ko_build_timeout:
+    description: 'Timeout in seconds for Karpenter KWOK provider ko build'
+    required: false
+    default: '900'
+  karpenter_helm_timeout:
+    description: 'Timeout for Karpenter Helm install'
+    required: false
+    default: '300s'
 
 runs:
   using: 'composite'
@@ -46,6 +58,9 @@ runs:
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
         KARPENTER_VERSION: ${{ steps.versions.outputs.karpenter }}
+        KWOK_HELM_TIMEOUT: ${{ inputs.kwok_helm_timeout }}
+        KO_BUILD_TIMEOUT: ${{ inputs.ko_build_timeout }}
+        KARPENTER_HELM_TIMEOUT: ${{ inputs.karpenter_helm_timeout }}
       run: |
         set -euo pipefail
         bash kwok/scripts/install-karpenter-kwok.sh
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index c5e1882d4..373cd3c6e 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -50,6 +50,7 @@ jobs:
               - '.settings.yaml'
               - '.github/actions/gpu-cluster-setup/**'
               - '.github/actions/gpu-operator-install/**'
+              - '.github/actions/check-control-plane-health/**'
               - '.github/actions/aicr-build/**'
               - '.github/actions/setup-build-tools/**'
               - '.github/actions/install-karpenter-kwok/**'
@@ -96,7 +97,9 @@ jobs:
       group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
       cancel-in-progress: true
     runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 120
+    # Cold self-hosted H100 runners can spend most of a 120-minute budget
+    # pulling images and loading Kind nodes before validation starts.
+    timeout-minutes: 180
 
     env:
       KIND_CLUSTER_NAME: gpu-inference-test
@@ -116,6 +119,12 @@ jobs:
         with:
           validator_phases: 'conformance'
 
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+
       - name: Install runtime bundle
         id: bundle-install
         uses: ./.github/actions/gpu-operator-install
@@ -123,6 +132,14 @@ jobs:
           method: bundle
           accelerator: h100
           platform: dynamo
+          wait: 'true'
+          best_effort: 'false'
+
+      - name: Check control plane health after runtime install
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
 
       # --- Snapshot and GPU validation ---
 
@@ -132,16 +149,32 @@ jobs:
           gpu_model: H100
           min_gpu_count: '2'
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          snapshot_timeout: 10m
 
       # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
 
+      - name: Check control plane health before Karpenter
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+
       - name: Install Karpenter + KWOK
         uses: ./.github/actions/install-karpenter-kwok
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          kwok_helm_timeout: 600s
+          ko_build_timeout: '1200'
+          karpenter_helm_timeout: 600s
 
       # --- Health checks ---
 
+      - name: Check control plane health after Karpenter
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+
       - name: Prepare chainsaw
         id: versions
         uses: ./.github/actions/load-versions
@@ -171,6 +204,12 @@ jobs:
             --kubeconfig="${HOME}/.kube/config" \
             --debug
 
+      - name: Check control plane health before conformance validation
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+
       - name: Validate CNCF AI Conformance
         id: validate-conformance
         run: |
@@ -193,8 +232,9 @@ jobs:
       # training smoke test if needed.
       # --- Validation artifacts ---
 
-      # Collect a post-run resource snapshot regardless of whether conformance
-      # validation ran, so triage always has a cluster-state artifact.
+      # Collect a post-run resource snapshot whenever the runtime bundle
+      # installed. This preserves triage data for snapshot/chainsaw/conformance
+      # failures; continue-on-error keeps the original failure intact.
       - name: Collect validation artifacts
         if: >-
           always()
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index d3a04de03..36a2d1eec 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -50,6 +50,7 @@ jobs:
               - '.settings.yaml'
               - '.github/actions/gpu-cluster-setup/**'
               - '.github/actions/gpu-operator-install/**'
+              - '.github/actions/check-control-plane-health/**'
               - '.github/actions/aicr-build/**'
               - '.github/actions/setup-build-tools/**'
               - '.github/actions/install-karpenter-kwok/**'
@@ -92,7 +93,9 @@ jobs:
       group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
       cancel-in-progress: true
     runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 120
+    # Cold self-hosted H100 runners can spend most of a 120-minute budget
+    # pulling images and loading Kind nodes before validation starts.
+    timeout-minutes: 180
 
     env:
       KIND_CLUSTER_NAME: gpu-training-test
@@ -112,6 +115,12 @@ jobs:
         with:
           validator_phases: 'conformance'
 
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+
       - name: Install runtime bundle
         id: bundle-install
         uses: ./.github/actions/gpu-operator-install
@@ -120,6 +129,14 @@ jobs:
           accelerator: h100
           intent: training
           platform: kubeflow
+          wait: 'true'
+          best_effort: 'false'
+
+      - name: Check control plane health after runtime install
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
 
       # --- Snapshot and GPU validation ---
 
@@ -129,16 +146,32 @@ jobs:
           gpu_model: H100
           min_gpu_count: '2'
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          snapshot_timeout: 10m
 
       # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
 
+      - name: Check control plane health before Karpenter
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+
       - name: Install Karpenter + KWOK
         uses: ./.github/actions/install-karpenter-kwok
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          kwok_helm_timeout: 600s
+          ko_build_timeout: '1200'
+          karpenter_helm_timeout: 600s
 
       # --- Health checks ---
 
+      - name: Check control plane health after Karpenter
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+
       - name: Prepare chainsaw
         id: versions
         uses: ./.github/actions/load-versions
@@ -168,6 +201,12 @@ jobs:
             --kubeconfig="${HOME}/.kube/config" \
             --debug
 
+      - name: Check control plane health before conformance validation
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+
       - name: Validate CNCF AI Conformance
         id: validate-conformance
         run: |
@@ -186,8 +225,9 @@ jobs:
 
       # --- Validation artifacts ---
 
-      # Collect a post-run resource snapshot regardless of whether conformance
-      # validation ran, so triage always has a cluster-state artifact.
+      # Collect a post-run resource snapshot whenever the runtime bundle
+      # installed. This preserves triage data for snapshot/chainsaw/conformance
+      # failures; continue-on-error keeps the original failure intact.
       - name: Collect validation artifacts
         if: >-
           always()
diff --git a/kwok/scripts/install-karpenter-kwok.sh b/kwok/scripts/install-karpenter-kwok.sh
index 72b64dae1..d6a17481f 100755
--- a/kwok/scripts/install-karpenter-kwok.sh
+++ b/kwok/scripts/install-karpenter-kwok.sh
@@ -41,7 +41,9 @@ KARPENTER_VERSION="${KARPENTER_VERSION:-v1.8.0}"
 KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
 KARPENTER_NAMESPACE="${KARPENTER_NAMESPACE:-karpenter}"
 KARPENTER_CLONE_DIR="${KARPENTER_CLONE_DIR:-/tmp/karpenter}"
+KWOK_HELM_TIMEOUT="${KWOK_HELM_TIMEOUT:-300s}"
 KO_BUILD_TIMEOUT="${KO_BUILD_TIMEOUT:-900}"  # 15 minutes
+KARPENTER_HELM_TIMEOUT="${KARPENTER_HELM_TIMEOUT:-300s}"
 
 RED='\033[0;31m'
 GREEN='\033[0;32m'
@@ -68,7 +70,7 @@ install_kwok() {
     helm upgrade --install kwok-controller kwok/kwok \
         --namespace kube-system \
         --set hostNetwork=true \
-        --wait --timeout 300s
+        --wait --timeout "${KWOK_HELM_TIMEOUT}"
 
     helm upgrade --install kwok-stage-fast kwok/stage-fast \
         --namespace kube-system
@@ -98,11 +100,16 @@ build_karpenter() {
     # Redirect stderr to avoid Go compilation warnings corrupting the image reference.
     # Output format: kind.local/<name>:<content-hash>
     # Hard timeout prevents a slow/stuck compilation from consuming the entire job.
+    local ko_stderr="${KARPENTER_CLONE_DIR}/ko-build.stderr"
     CONTROLLER_IMG=$(timeout "${KO_BUILD_TIMEOUT}" \
         env KO_DOCKER_REPO=kind.local \
         KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}" \
-        ko build sigs.k8s.io/karpenter/kwok 2>/dev/null) || {
+        ko build sigs.k8s.io/karpenter/kwok 2>"${ko_stderr}") || {
         log_error "ko build failed or timed out after ${KO_BUILD_TIMEOUT}s"
+        if [[ -s "${ko_stderr}" ]]; then
+            log_error "ko build stderr:"
+            sed 's/^/  /' "${ko_stderr}" || true
+        fi
         exit 1
     }
 
@@ -187,7 +194,7 @@ deploy_karpenter() {
         --set 'controller.extraVolumeMounts[0].readOnly=true' \
         --set 'controller.env[0].name=INSTANCE_TYPES_FILE_PATH' \
         --set 'controller.env[0].value=/etc/karpenter/instance-types/instance-types.json' \
-        --wait --timeout 300s \
+        --wait --timeout "${KARPENTER_HELM_TIMEOUT}" \
         || {
             log_error "Helm install failed. Diagnostics:"
             kubectl -n "${KARPENTER_NAMESPACE}" get pods -o wide 2>/dev/null || true
@@ -213,6 +220,7 @@ main() {
     log_info "Karpenter version: ${KARPENTER_VERSION}"
     log_info "Kind cluster: ${KIND_CLUSTER_NAME}"
     log_info "Namespace: ${KARPENTER_NAMESPACE}"
+    log_info "Timeouts: kwok=${KWOK_HELM_TIMEOUT} ko-build=${KO_BUILD_TIMEOUT}s karpenter=${KARPENTER_HELM_TIMEOUT}"
 
     install_kwok
     build_karpenter
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index 435948e8f..79c22f8fd 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -20,6 +20,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"strings"
 	"testing"
 	"time"
@@ -493,9 +494,9 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) {
 	}
 	script := string(content)
 
-	// kai-scheduler should get a custom 20m timeout override
-	if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="20m"`) {
-		t.Error("deploy.sh missing kai-scheduler 20m timeout override")
+	// kai-scheduler should get a custom 30m timeout override
+	if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="30m"`) {
+		t.Error("deploy.sh missing kai-scheduler 30m timeout override")
 	}
 	// Other components should use the default HELM_TIMEOUT
 	if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`) {
@@ -505,6 +506,9 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) {
 	if !strings.Contains(script, `COMPONENT_MAX_RETRIES="1"`) {
 		t.Error("deploy.sh missing kai-scheduler retry override")
 	}
+	if !strings.Contains(script, `if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]`) {
+		t.Error("deploy.sh missing kai-scheduler retry cap")
+	}
 	if !strings.Contains(script, `dump_kai_scheduler_helm_diagnostics "${namespace}"`) {
 		t.Error("deploy.sh missing kai-scheduler diagnostics hook")
 	}
@@ -516,6 +520,105 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) {
 	}
 }
 
+func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
+	retryCapPattern := regexp.MustCompile(`(?m)(if \[\[ "\$\{COMPONENT_MAX_RETRIES\}" -gt \d+ \]\]|COMPONENT_MAX_RETRIES="\d+")`) // retry-cap guard or literal numeric assignment
+	tests := []struct {
+		name                string
+		component           recipe.ComponentRef
+		wantTimeout         string
+		wantRetryAssignment string
+		wantRetryCap        string
+		wantComment         string
+		rejectRetryCap      bool
+	}{
+		{
+			name: "dynamo-platform",
+			component: recipe.ComponentRef{
+				Name:      "dynamo-platform",
+				Namespace: "dynamo-system",
+				Chart:     "dynamo-platform",
+				Version:   "0.9.0",
+				Type:      recipe.ComponentTypeHelm,
+				Source:    "oci://nvcr.io/nvidia/ai-dynamo",
+			},
+			wantTimeout:         `COMPONENT_HELM_TIMEOUT="30m"`,
+			wantRetryAssignment: `COMPONENT_MAX_RETRIES="1"`,
+			wantRetryCap:        `if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]`,
+		},
+		{
+			name: "kube-prometheus-stack",
+			component: recipe.ComponentRef{
+				Name:      "kube-prometheus-stack",
+				Namespace: "monitoring",
+				Chart:     "kube-prometheus-stack",
+				Version:   "82.8.0",
+				Type:      recipe.ComponentTypeHelm,
+				Source:    "https://prometheus-community.github.io/helm-charts",
+			},
+			wantTimeout:    `COMPONENT_HELM_TIMEOUT="20m"`,
+			wantComment:    `Keep the default retry budget for kube-prometheus-stack`,
+			rejectRetryCap: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx := context.Background()
+			outputDir := t.TempDir()
+
+			g := &Generator{
+				RecipeResult: &recipe.RecipeResult{
+					Kind:            "RecipeResult",
+					APIVersion:      "aicr.nvidia.com/v1alpha1",
+					ComponentRefs:   []recipe.ComponentRef{tt.component},
+					DeploymentOrder: []string{tt.component.Name},
+				},
+				ComponentValues: map[string]map[string]any{
+					tt.component.Name: {},
+				},
+				Version: "v1.0.0",
+			}
+
+			_, err := g.Generate(ctx, outputDir)
+			if err != nil {
+				t.Fatalf("Generate failed: %v", err)
+			}
+
+			content, err := os.ReadFile(filepath.Join(outputDir, "deploy.sh"))
+			if err != nil {
+				t.Fatalf("failed to read deploy.sh: %v", err)
+			}
+			script := string(content)
+
+			blockStart := strings.Index(script, `Installing `+tt.component.Name)
+			if blockStart == -1 {
+				t.Fatalf("deploy.sh missing %s install block", tt.component.Name)
+			}
+			blockEnd := strings.Index(script[blockStart:], `helm upgrade --install `+tt.component.Name)
+			if blockEnd == -1 {
+				t.Fatalf("deploy.sh missing %s helm install command", tt.component.Name)
+			}
+			componentBlock := script[blockStart : blockStart+blockEnd] // text from the "Installing <name>" marker up to the helm command
+
+			if !strings.Contains(componentBlock, tt.wantTimeout) {
+				t.Errorf("deploy.sh missing %s timeout override %q", tt.component.Name, tt.wantTimeout)
+			}
+			if tt.wantRetryAssignment != "" && !strings.Contains(componentBlock, tt.wantRetryAssignment) {
+				t.Errorf("deploy.sh missing %s retry override %q", tt.component.Name, tt.wantRetryAssignment)
+			}
+			if tt.wantRetryCap != "" && !strings.Contains(componentBlock, tt.wantRetryCap) {
+				t.Errorf("deploy.sh missing %s retry cap %q", tt.component.Name, tt.wantRetryCap)
+			}
+			if tt.wantComment != "" && !strings.Contains(componentBlock, tt.wantComment) {
+				t.Errorf("deploy.sh missing %s retry rationale %q", tt.component.Name, tt.wantComment)
+			}
+			if tt.rejectRetryCap && retryCapPattern.MatchString(componentBlock) {
+				t.Errorf("deploy.sh should not cap %s retries in its component block", tt.component.Name)
+			}
+		})
+	}
+}
+
 func TestGenerate_UndeployScriptExecutable(t *testing.T) {
 	ctx := context.Background()
 	outputDir := t.TempDir()
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index 0f83eb71c..e456bba58 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -371,13 +371,25 @@ retry "{{ .Name }} pre-install manifests" apply_ignoring_crd_race "${SCRIPT_DIR}
   || helm_failed "{{ .Name }}"
 {{ end -}}
 # Per-component timeout override. Most components use HELM_TIMEOUT (10m).
-# Components with slow hooks (e.g., kai-scheduler crd-upgrader image pull
-# on cold runners) get a longer timeout to avoid unnecessary retry cycles.
+# Components with slow hooks on cold runners get a longer timeout to avoid
+# unnecessary retry cycles.
 COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"
 COMPONENT_MAX_RETRIES="${MAX_RETRIES}"
 {{ if eq .Name "kai-scheduler" -}}
+COMPONENT_HELM_TIMEOUT="30m"
+if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
+  COMPONENT_MAX_RETRIES="1"
+fi
+{{ else if eq .Name "dynamo-platform" -}}
+COMPONENT_HELM_TIMEOUT="30m"
+if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
+  COMPONENT_MAX_RETRIES="1"
+fi
+{{ else if eq .Name "kube-prometheus-stack" -}}
 COMPONENT_HELM_TIMEOUT="20m"
-COMPONENT_MAX_RETRIES="1"
+# Keep the default retry budget for kube-prometheus-stack. On cold H100
+# runners, Grafana can hit ProgressDeadlineExceeded multiple times before
+# images and rollout state are warm enough for a later retry to succeed.
 {{ end -}}
 # Derive wait args: global --wait/--no-wait behavior + component timeout.
 if [[ "${NO_WAIT}" == "true" ]]; then
diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
index 382d99104..9809cd0bb 100644
--- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
+++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
@@ -73,6 +73,8 @@ spec:
 
     - name: assert-kubeflow-trainer
       description: Verify Kubeflow Trainer controller, validating webhook, and TrainJob CRD are available.
+      timeouts:
+        assert: 600s
       try:
         - assert:
             file: assert-kubeflow-trainer.yaml

From ae0b717e04746f7ed1481621f6747ed84b84d787 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 18:06:56 -0700
Subject: [PATCH 02/21] ci: split aicr build artifacts

---
 .github/actions/aicr-build/action.yml | 32 +++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml
index 7a973ae21..ce9841f2f 100644
--- a/.github/actions/aicr-build/action.yml
+++ b/.github/actions/aicr-build/action.yml
@@ -13,9 +13,17 @@
 # limitations under the License.
 
 name: 'AICR Build'
-description: 'Builds the aicr validator image (via Dockerfile) and CLI binary, and loads the image into kind.'
+description: 'Builds the aicr CLI and optional snapshot/validator images, and loads requested images into kind.'
 
 inputs:
+  build_cli:
+    description: 'Build and stage the aicr CLI binary at the repository root'
+    required: false
+    default: 'true'
+  build_snapshot_agent:
+    description: 'Build the CUDA-based snapshot agent image and load it into kind'
+    required: false
+    default: 'true'
   build_validators:
     description: 'Deprecated: use validator_phases instead. Ignored when validator_phases is set.'
     required: false
@@ -35,15 +43,27 @@ runs:
         KO_VERSION=$(yq eval '.build_tools.ko' .settings.yaml)
         GOFLAGS= go install "github.com/google/ko@${KO_VERSION}"
 
-    - name: Build snapshot agent image and load into kind
+    - name: Build aicr CLI binary
+      if: inputs.build_cli == 'true' || inputs.build_snapshot_agent == 'true'
       shell: bash
       env:
         GOFLAGS: -mod=vendor
       run: |
+        set -euo pipefail
+        if [[ -x dist/aicr ]]; then
+          echo "Reusing existing dist/aicr"
+          exit 0
+        fi
+        CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
+
+    - name: Build snapshot agent image and load into kind
+      if: inputs.build_snapshot_agent == 'true'
+      shell: bash
+      run: |
+        set -euo pipefail
         # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
         # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed.
         # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot.
-        CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
         docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
         FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04
         COPY dist/aicr /usr/local/bin/aicr
@@ -69,6 +89,7 @@ runs:
       env:
         GOFLAGS: -mod=vendor
       run: |
+        set -euo pipefail
         # Determine which validator phases to build.
         # validator_phases takes precedence; build_validators is a deprecated fallback.
         if [[ -n "${{ inputs.validator_phases }}" ]]; then
@@ -109,5 +130,8 @@ runs:
         done
 
     - name: Stage aicr binary at repo root
+      if: inputs.build_cli == 'true'
       shell: bash
-      run: cp dist/aicr ./aicr
+      run: |
+        set -euo pipefail
+        cp dist/aicr ./aicr

From 33417c3e4aac6bd4f5b70daab0e4b392221baad4 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 18:07:00 -0700
Subject: [PATCH 03/21] ci: harden gpu cluster setup

---
 .github/actions/README.md                    |   3 +-
 .github/actions/gpu-cluster-setup/action.yml | 164 ++++++++++++++++++-
 .github/actions/load-versions/action.yml     |  10 ++
 .settings.yaml                               |   2 +
 4 files changed, 173 insertions(+), 6 deletions(-)

diff --git a/.github/actions/README.md b/.github/actions/README.md
index cef2fd6ca..3b1ee648d 100644
--- a/.github/actions/README.md
+++ b/.github/actions/README.md
@@ -50,7 +50,8 @@ This action runs `tools/setup-tools --skip-go --skip-docker` in auto mode, which
 **When to use**: When you need version values in workflow steps
 **Outputs**:
 - `go`, `goreleaser`, `ko`, `crane`, `golangci_lint`, `yamllint`, `addlicense`
-- `grype`, `kubectl`, `kind`, `ctlptl`, `tilt`, `helm`
+- `grype`, `kubectl`, `kind`, `nvkind`, `ctlptl`, `tilt`, `helm`
+- `kind_node_image`, `h100_kind_node_image`
 
 **Example**:
 ```yaml
diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index b9bc3060f..c5b5bb3cd 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -15,6 +15,32 @@
 name: 'GPU Cluster Setup'
 description: 'Creates a GPU-enabled kind cluster using nvkind with CDI-mode GPU passthrough.'
 
+inputs:
+  kind_node_image:
+    description: 'Kind node image for nvkind cluster creation'
+    required: false
+    default: ''
+  min_gpu_count:
+    description: 'Minimum visible GPU count required before cluster setup'
+    required: false
+    default: '1'
+  gpu_model_pattern:
+    description: 'Optional grep-compatible GPU model pattern required for visible GPUs'
+    required: false
+    default: ''
+  min_free_disk_gb:
+    description: 'Minimum free disk space on / required before cluster setup'
+    required: false
+    default: '20'
+  min_available_memory_gb:
+    description: 'Minimum available system memory required before cluster setup'
+    required: false
+    default: '8'
+  cluster_create_timeout:
+    description: 'Timeout for nvkind cluster create'
+    required: false
+    default: '900s'
+
 runs:
   using: 'composite'
   steps:
@@ -53,12 +79,84 @@ runs:
     - name: Install nvkind
       shell: bash
       run: |
-        go install github.com/NVIDIA/nvkind/cmd/nvkind@latest
+        go install "github.com/NVIDIA/nvkind/cmd/nvkind@${{ steps.versions.outputs.nvkind }}"
         nvkind --help
 
-    - name: Verify host GPU
+    - name: Runner preflight
       shell: bash
-      run: nvidia-smi -L
+      env:
+        GPU_MODEL_PATTERN: ${{ inputs.gpu_model_pattern }}
+        MIN_GPU_COUNT: ${{ inputs.min_gpu_count }}
+        MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
+        MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }}
+        KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
+      run: |
+        set -euo pipefail
+
+        echo "=== Runner baseline ==="
+        date -u
+        hostname
+        uptime
+        nproc
+        free -h
+        df -h /
+        df -ih /
+
+        for value_name in MIN_GPU_COUNT MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do
+          value="${!value_name}"
+          if ! [[ "${value}" =~ ^[0-9]+$ ]]; then
+            echo "::error::${value_name} must be an integer, got '${value}'"
+            exit 1
+          fi
+        done
+
+        free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
+        if (( free_disk_gb < MIN_FREE_DISK_GB )); then
+          echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB"
+          exit 1
+        fi
+
+        available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}')
+        if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then
+          echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB"
+          exit 1
+        fi
+
+        echo "=== Docker health ==="
+        docker info >/dev/null
+        docker version
+
+        echo "=== Host GPUs ==="
+        nvidia-smi -L
+        nvidia-smi
+
+        mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader)
+        if [[ -n "${GPU_MODEL_PATTERN}" ]]; then
+          gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic "${GPU_MODEL_PATTERN}" || true)
+          echo "Visible GPUs matching '${GPU_MODEL_PATTERN}': ${gpu_count}"
+        else
+          gpu_count="${#gpu_names[@]}"
+          echo "Visible GPUs: ${gpu_count}"
+        fi
+
+        if (( gpu_count < MIN_GPU_COUNT )); then
+          echo "::error::visible GPU count ${gpu_count} is below required minimum ${MIN_GPU_COUNT}"
+          exit 1
+        fi
+
+        echo "=== Existing kind state ==="
+        kind get clusters || true
+        docker ps -a --filter "name=${KIND_CLUSTER_NAME}" || true
+
+        if [[ -n "${KIND_NODE_IMAGE}" ]]; then
+          echo "=== Kind node image cache ==="
+          if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then
+            echo "Kind node image already cached: ${KIND_NODE_IMAGE}"
+          else
+            echo "Pulling kind node image: ${KIND_NODE_IMAGE}"
+            timeout 600s docker pull "${KIND_NODE_IMAGE}"
+          fi
+        fi
 
     - name: Configure NVIDIA Container Toolkit for kind
       shell: bash
@@ -70,7 +168,9 @@ runs:
 
     - name: Validate Docker GPU access
       shell: bash
-      run: docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L
+      run: |
+        set -euo pipefail
+        docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L
 
     - name: Increase inotify limits
       shell: bash
@@ -78,14 +178,68 @@ runs:
         sudo sysctl -w fs.inotify.max_user_watches=524288
         sudo sysctl -w fs.inotify.max_user_instances=1024
 
+    - name: Delete stale kind cluster
+      shell: bash
+      run: |
+        set -euo pipefail
+        if grep -Fxq "${KIND_CLUSTER_NAME}" <<< "$(kind get clusters)"; then  # here-string, not a pipe: grep -q exiting early cannot trip pipefail
+          echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
+          timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"
+        else
+          echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
+        fi
+        # Sweep leftover node containers by kind's cluster label: an exact match, unlike name= (substring).
+        remaining_containers=$(docker ps -aq --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}")
+        if [[ -n "${remaining_containers}" ]]; then
+          echo "Removing stale containers for ${KIND_CLUSTER_NAME}:"
+          docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+          docker rm -f ${remaining_containers}
+        fi
+        # Re-check after the forced removal and fail the step if anything is still present.
+        remaining_containers=$(docker ps -aq --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}")
+        if [[ -n "${remaining_containers}" ]]; then
+          echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:"
+          docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+          exit 1
+        fi
+
     - name: Create GPU-enabled kind cluster
       shell: bash
+      env:
+        KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
+        CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }}
       run: |
-        nvkind cluster create --name="${KIND_CLUSTER_NAME}" || echo "::warning::nvkind cluster create returned non-zero (umount errors are expected with CDI mode)"
+        set -euo pipefail
+
+        CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}")
+        if [[ -n "${KIND_NODE_IMAGE}" ]]; then
+          echo "Using kind node image: ${KIND_NODE_IMAGE}"
+          CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}")
+        fi
+
+        set +e
+        timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}"
+        create_status=$?
+        set -e
+        if (( create_status != 0 )); then
+          echo "::warning::nvkind cluster create exited with status ${create_status}; continuing only if post-create checks pass"
+        fi
+
         kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s
         kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info
         kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide
 
+        echo "=== Control-plane resource requests/limits ==="
+        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
+          get pods -l tier=control-plane -o json | jq -r '
+            .items[] as $pod |
+            $pod.metadata.name,
+            ($pod.spec.containers[] |
+              "  " + .name +
+              " requests=" + ((.resources.requests // {}) | tostring) +
+              " limits=" + ((.resources.limits // {}) | tostring))
+          ' || true
+
     - name: Print GPUs (nvkind)
       shell: bash
       run: nvkind cluster print-gpus --name="${KIND_CLUSTER_NAME}"
diff --git a/.github/actions/load-versions/action.yml b/.github/actions/load-versions/action.yml
index b87e321d1..b3c506d40 100644
--- a/.github/actions/load-versions/action.yml
+++ b/.github/actions/load-versions/action.yml
@@ -40,6 +40,9 @@ outputs:
   kind:
     description: 'Kind version'
     value: ${{ steps.versions.outputs.kind }}
+  nvkind:
+    description: 'nvkind git ref'
+    value: ${{ steps.versions.outputs.nvkind }}
   ctlptl:
     description: 'ctlptl version'
     value: ${{ steps.versions.outputs.ctlptl }}
@@ -91,6 +94,9 @@ outputs:
   kind_node_image:
     description: 'Kind node image for testing'
     value: ${{ steps.versions.outputs.kind_node_image }}
+  h100_kind_node_image:
+    description: 'Kind node image for H100 GPU tests'
+    value: ${{ steps.versions.outputs.h100_kind_node_image }}
 
 runs:
   using: 'composite'
@@ -121,6 +127,7 @@ runs:
         # Testing tools
         echo "kubectl=$(yq eval '.testing_tools.kubectl' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "kind=$(yq eval '.testing_tools.kind' .settings.yaml)" >> $GITHUB_OUTPUT
+        echo "nvkind=$(yq eval '.testing_tools.nvkind' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "ctlptl=$(yq eval '.testing_tools.ctlptl' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "tilt=$(yq eval '.testing_tools.tilt' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "helm=$(yq eval '.testing_tools.helm' .settings.yaml)" >> $GITHUB_OUTPUT
@@ -141,6 +148,7 @@ runs:
 
         # Testing configuration
         echo "kind_node_image=$(yq eval '.testing.kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT
+        echo "h100_kind_node_image=$(yq eval '.testing.h100_kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT
 
     - name: Display loaded versions
       shell: bash
@@ -158,6 +166,7 @@ runs:
         echo "  grype: ${{ steps.versions.outputs.grype }}"
         echo "  kubectl: ${{ steps.versions.outputs.kubectl }}"
         echo "  kind: ${{ steps.versions.outputs.kind }}"
+        echo "  nvkind: ${{ steps.versions.outputs.nvkind }}"
         echo "  ctlptl: ${{ steps.versions.outputs.ctlptl }}"
         echo "  tilt: ${{ steps.versions.outputs.tilt }}"
         echo "  helm: ${{ steps.versions.outputs.helm }}"
@@ -172,3 +181,4 @@ runs:
         echo "  lint_timeout: ${{ steps.versions.outputs.lint_timeout }}"
         echo "  test_timeout: ${{ steps.versions.outputs.test_timeout }}"
         echo "  kind_node_image: ${{ steps.versions.outputs.kind_node_image }}"
+        echo "  h100_kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}"
diff --git a/.settings.yaml b/.settings.yaml
index 75b4559b1..5ef15198f 100644
--- a/.settings.yaml
+++ b/.settings.yaml
@@ -40,6 +40,7 @@ security_tools:
 testing_tools:
   kubectl: 'v1.35.0'
   kind: '0.31.0'
+  nvkind: '78a0a514c41c3e77ac0d935f38d971d3b4455138'
   ctlptl: '0.9.0'
   tilt: '0.37.0'
   helm: 'v4.1.1'
@@ -71,6 +72,7 @@ docs_tools:
 # Testing Configuration
 testing:
   kind_node_image: 'kindest/node:v1.32.0'
+  h100_kind_node_image: 'kindest/node:v1.35.0'
 
   # Component test harness configuration
   # Used by tools/component-test/ scripts to validate individual components

From b7ffe6beeed5059f8feeebce62686e97fe9a5975 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 18:07:04 -0700
Subject: [PATCH 04/21] ci: tune H100 workflow reliability

---
 .../check-control-plane-health/action.yml     | 294 +++++++++++++++---
 .github/actions/gpu-test-cleanup/action.yml   |  37 ++-
 .../workflows/gpu-h100-inference-test.yaml    | 128 +++++---
 .github/workflows/gpu-h100-training-test.yaml | 117 ++++---
 .../kind-inference-dynamo/chainsaw-test.yaml  |   2 +
 .../kind-training-kubeflow/chainsaw-test.yaml |   2 +
 6 files changed, 443 insertions(+), 137 deletions(-)

diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index ae2a87db7..bee31623b 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 name: 'Check Control Plane Health'
-description: 'Fails if Kind control-plane static pods are missing, unready, or restarted.'
+description: 'Fails if Kind control-plane static pods are missing, unready, or unstable.'
 
 inputs:
   cluster_name:
@@ -35,6 +35,22 @@ inputs:
     description: 'Maximum tolerated restart count for each control-plane container'
     required: false
     default: '1'
+  stability_window:
+    description: 'Optional duration to watch for new control-plane restarts after pods are Ready'
+    required: false
+    default: '0s'
+  recover_unhealthy:
+    description: 'Restart eligible Kind control-plane static pod containers when they are currently unhealthy'
+    required: false
+    default: 'false'
+  recovery_components:
+    description: 'Space-separated component label values eligible for recovery'
+    required: false
+    default: 'kube-controller-manager kube-scheduler kube-apiserver'
+  max_recovery_attempts:
+    description: 'Maximum recovery attempts for each eligible component'
+    required: false
+    default: '1'
 
 runs:
   using: 'composite'
@@ -47,6 +63,10 @@ runs:
         COMPONENTS: ${{ inputs.components }}
         WAIT_TIMEOUT: ${{ inputs.wait_timeout }}
         MAX_RESTARTS: ${{ inputs.max_restarts }}
+        STABILITY_WINDOW: ${{ inputs.stability_window }}
+        RECOVER_UNHEALTHY: ${{ inputs.recover_unhealthy }}
+        RECOVERY_COMPONENTS: ${{ inputs.recovery_components }}
+        MAX_RECOVERY_ATTEMPTS: ${{ inputs.max_recovery_attempts }}
       run: |
         set -euo pipefail
 
@@ -57,60 +77,262 @@ runs:
           exit 1
         fi
 
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz' || true
+        MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}"
+        MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}"
+        if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then
+          echo "::error::max_recovery_attempts must be a non-negative integer, got '${MAX_RECOVERY_ATTEMPTS}'"
+          exit 1
+        fi
 
-        check_component() {
-          local component="$1"
-          local selector="component=${component}"
-          local pods
+        STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}"
+        STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}"
+        if [[ -z "${STABILITY_WINDOW}" ]]; then
+          STABILITY_WINDOW="0s"
+        fi
+        if ! [[ "${STABILITY_WINDOW}" =~ ^[0-9]+[smh]$ ]]; then
+          echo "::error::stability_window must be a duration like 0s, 60s, or 2m; got '${STABILITY_WINDOW}'"
+          exit 1
+        fi
 
-          pods=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
-            get pod -l "${selector}" -o name)
-          if [[ -z "${pods}" ]]; then
-            echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
-              get pods -o wide || true
+        RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}"
+        RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}"
+        case "${RECOVER_UNHEALTHY}" in
+          true|false) ;;
+          *)
+            echo "::error::recover_unhealthy must be true or false, got '${RECOVER_UNHEALTHY}'"
             exit 1
-          fi
+            ;;
+        esac
+
+        kubectl_kind() {
+          timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+        }
+
+        declare -A RECOVERY_ATTEMPTS=()
+
+        kubectl_kind get --raw='/readyz' || true
+
+        wait_ready() {
+          local component="$1"
+          local selector="component=${component}"
 
           if ! kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
             wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then
-            echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
-              get pod -l "${selector}" -o wide || true
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
-              describe pod -l "${selector}" || true
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
-              get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz' || true
-            exit 1
+            return 1
           fi
+        }
 
+        restart_total() {
+          local component="$1"
+          local selector="component=${component}"
           local restart_counts
-          restart_counts=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
-            get pod -l "${selector}" \
-            -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}')
+          local restart_count
+          local total=0
+
+          if ! restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \
+            -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then
+            echo "::error::failed to read restart counts for ${component} pods" >&2
+            dump_component_diagnostics "${component}" >&2
+            exit 1
+          fi
           if [[ -z "${restart_counts}" ]]; then
-            echo "::error::no container statuses found for ${component} pods"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
-              describe pod -l "${selector}" || true
+            echo "::error::no container statuses found for ${component} pods" >&2
+            dump_component_diagnostics "${component}" >&2
             exit 1
           fi
 
           while IFS= read -r restart_count; do
             [[ -z "${restart_count}" ]] && continue
-            if (( restart_count > MAX_RESTARTS )); then
-              echo "::error::${component} restartCount=${restart_count}"
-              kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
-                get pod -l "${selector}" -o wide || true
-              kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
-                describe pod -l "${selector}" || true
+            total=$((total + restart_count))
+          done <<< "${restart_counts}"
+          echo "${total}"
+        }
+
+        dump_component_diagnostics() {
+          local component="$1"
+          local selector="component=${component}"
+          local pods
+          local pod
+          # Best-effort diagnostics for component $1 (pod listing, describe, last 30 event lines); every command tolerates failure.
+          kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true
+          kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true
+          kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+          # Print the last 100 log lines of each matching pod, for the current and the previous container instances.
+          pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name 2>/dev/null || true)
+          while IFS= read -r pod; do
+            [[ -z "${pod}" ]] && continue
+            echo "=== ${pod} logs ==="
+            kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --tail=100 2>/dev/null || true
+            echo "=== ${pod} previous logs ==="
+            kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true
+          done <<< "${pods}"
+          # Also show the lease named after the component in kube-system, if one exists.
+          kubectl_kind -n kube-system get lease "${component}" -o yaml 2>/dev/null || true
+        }
+
+        # Succeeds when $1 is one of the whitespace-separated names in RECOVERY_COMPONENTS.
+        is_recovery_component() {
+          local wanted="$1"
+          local entry
+          # Unquoted on purpose: word splitting turns the list into individual entries.
+          for entry in ${RECOVERY_COMPONENTS}; do
+            [[ "${entry}" != "${wanted}" ]] || return 0
+          done
+          # No entry matched.
+          return 1
+        }
+
+        try_recover_component() {
+          local component="$1"
+          local reason="$2"
+          local node="${KIND_CLUSTER_NAME}-control-plane"
+          local attempt
+          local container_ids
+          local container_id
+          # Try to recover component $1 (reason $2 is only logged); returns 0 once its pods are Ready again, 1 otherwise.
+          if [[ "${RECOVER_UNHEALTHY}" != "true" ]]; then
+            return 1
+          fi
+          if (( MAX_RECOVERY_ATTEMPTS == 0 )); then
+            return 1
+          fi
+          if ! is_recovery_component "${component}"; then
+            return 1
+          fi
+          # Each component has its own attempt counter; the attempt is recorded before any work is done.
+          attempt="${RECOVERY_ATTEMPTS[${component}]:-0}"
+          if (( attempt >= MAX_RECOVERY_ATTEMPTS )); then
+            return 1
+          fi
+          RECOVERY_ATTEMPTS["${component}"]=$((attempt + 1))
+          # Log the reason and capture diagnostics before touching the containers.
+          echo "::warning::${component} is unhealthy (${reason}); restarting static pod container (attempt $((attempt + 1))/${MAX_RECOVERY_ATTEMPTS})"
+          dump_component_diagnostics "${component}"
+          # The kind node must exist as a Docker container named "<cluster>-control-plane".
+          if ! docker inspect "${node}" >/dev/null 2>&1; then
+            echo "::warning::cannot recover ${component}: kind node container ${node} not found"
+            return 1
+          fi
+          # Look up the component's containers with crictl inside the node; a failed lookup counts as "none found".
+          container_ids=$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)
+          if [[ -z "${container_ids}" ]]; then
+            echo "::warning::cannot recover ${component}: no running container found in ${node}"
+            return 1
+          fi
+          # Stop every match. NOTE(review): presumably the kubelet then restarts the static pod container; confirm.
+          for container_id in ${container_ids}; do
+            echo "Stopping ${component} container ${container_id} in ${node}..."
+            if ! docker exec "${node}" crictl stop "${container_id}"; then
+              echo "::warning::failed to stop ${component} container ${container_id}"
+              return 1
+            fi
+          done
+          # Pause briefly, then wait for the component's pods to be Ready again.
+          sleep 5
+          if ! wait_ready "${component}"; then
+            echo "::warning::${component} did not recover after static pod container restart"
+            dump_component_diagnostics "${component}"
+            kubectl_kind get --raw='/readyz' || true
+            return 1
+          fi
+          # Pods are Ready again.
+          echo "${component} recovered after static pod container restart."
+          return 0
+        }
+
+        check_component() {
+          local component="$1"
+          local selector="component=${component}"
+          local pods
+          local initial_restarts
+          local final_restarts
+
+          if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then
+            if ! try_recover_component "${component}" "failed to list pods in ${NAMESPACE} with selector ${selector}"; then
+              echo "::error::failed to list ${component} pods in ${NAMESPACE} with selector ${selector}"
+              kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
               exit 1
             fi
-          done <<< "${restart_counts}"
+            if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then
+              echo "::error::failed to list ${component} pods after recovery"
+              kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+              exit 1
+            fi
+          fi
+          if [[ -z "${pods}" ]]; then
+            echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}"
+            kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+            exit 1
+          fi
+
+          if ! wait_ready "${component}"; then
+            if ! try_recover_component "${component}" "pods did not become Ready within ${WAIT_TIMEOUT}"; then
+              echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}"
+              dump_component_diagnostics "${component}"
+              kubectl_kind get --raw='/readyz' || true
+              exit 1
+            fi
+          fi
+          initial_restarts=$(restart_total "${component}")
+
+          if [[ "${STABILITY_WINDOW}" != "0s" ]]; then
+            if (( initial_restarts > MAX_RESTARTS )); then
+              echo "::warning::${component} historical restartCount=${initial_restarts}; checking for stability over ${STABILITY_WINDOW}"
+              dump_component_diagnostics "${component}"
+            fi
+
+            sleep "${STABILITY_WINDOW}"
+            if ! wait_ready "${component}"; then
+              if ! try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then
+                echo "::error::${component} pods became unready during ${STABILITY_WINDOW}"
+                dump_component_diagnostics "${component}"
+                kubectl_kind get --raw='/readyz' || true
+                exit 1
+              fi
+              initial_restarts=$(restart_total "${component}")
+              sleep "${STABILITY_WINDOW}"
+              if ! wait_ready "${component}"; then
+                echo "::error::${component} pods became unready after recovery"
+                dump_component_diagnostics "${component}"
+                kubectl_kind get --raw='/readyz' || true
+                exit 1
+              fi
+            fi
+            final_restarts=$(restart_total "${component}")
+            if (( final_restarts > initial_restarts )); then
+              if ! try_recover_component "${component}" "restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"; then
+                echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"
+                dump_component_diagnostics "${component}"
+                exit 1
+              fi
+              initial_restarts=$(restart_total "${component}")
+              sleep "${STABILITY_WINDOW}"
+              if ! wait_ready "${component}"; then
+                echo "::error::${component} pods became unready after recovery"
+                dump_component_diagnostics "${component}"
+                kubectl_kind get --raw='/readyz' || true
+                exit 1
+              fi
+              final_restarts=$(restart_total "${component}")
+              if (( final_restarts > initial_restarts )); then
+                echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery"
+                dump_component_diagnostics "${component}"
+                exit 1
+              fi
+            fi
+            return
+          fi
+
+          if (( initial_restarts > MAX_RESTARTS )); then
+            if ! try_recover_component "${component}" "restartCount=${initial_restarts}"; then
+              echo "::error::${component} restartCount=${initial_restarts}"
+              dump_component_diagnostics "${component}"
+              exit 1
+            fi
+          fi
         }
 
         for component in ${COMPONENTS}; do
           check_component "${component}"
         done
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw='/readyz'
+        kubectl_kind get --raw='/readyz'
diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml
index d2089816b..e8536e530 100644
--- a/.github/actions/gpu-test-cleanup/action.yml
+++ b/.github/actions/gpu-test-cleanup/action.yml
@@ -33,13 +33,18 @@ runs:
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
       run: |
+        set -o pipefail
         mkdir -p /tmp/debug-artifacts
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true
+        kubectl_kind() {
+          timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+        }
+
+        kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true
+        kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true
+        kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true
+        kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
+        kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
+        kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true
         tar_inputs=()
         [[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml)
         [[ -d bundle ]] && tar_inputs+=(bundle)
@@ -57,7 +62,16 @@ runs:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
       run: |
         mkdir -p /tmp/kind-logs
-        kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true
+        timeout 300s kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true
+
+    - name: Cleanup
+      if: always()
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: |
+        timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true
+        timeout 120s docker system prune -f || true
 
     - name: Upload debug artifacts
       if: failure()
@@ -68,12 +82,3 @@ runs:
           /tmp/debug-artifacts/
           /tmp/kind-logs/
         retention-days: 7
-
-    - name: Cleanup
-      if: always()
-      shell: bash
-      env:
-        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
-      run: |
-        kind delete cluster --name "${KIND_CLUSTER_NAME}" || true
-        docker system prune -f || true
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index 373cd3c6e..95df8740e 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -111,19 +111,33 @@ jobs:
         with:
           persist-credentials: false
 
+      - name: Load GPU test versions
+        id: versions
+        uses: ./.github/actions/load-versions
+
       - name: Set up GPU cluster
         uses: ./.github/actions/gpu-cluster-setup
+        with:
+          kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}
+          min_gpu_count: '2'
+          gpu_model_pattern: H100
+          min_free_disk_gb: '50'
+          min_available_memory_gb: '16'
+          cluster_create_timeout: 900s
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
         with:
-          validator_phases: 'conformance'
+          build_snapshot_agent: 'false'
+          validator_phases: 'none'
 
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
 
       - name: Install runtime bundle
         id: bundle-install
@@ -140,9 +154,18 @@ jobs:
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
 
       # --- Snapshot and GPU validation ---
 
+      - name: Build snapshot agent image
+        uses: ./.github/actions/aicr-build
+        with:
+          build_cli: 'false'
+          build_snapshot_agent: 'true'
+          validator_phases: 'none'
+
       - name: Snapshot and validate GPU
         uses: ./.github/actions/gpu-snapshot-validate
         with:
@@ -158,6 +181,8 @@ jobs:
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
 
       - name: Install Karpenter + KWOK
         uses: ./.github/actions/install-karpenter-kwok
@@ -174,10 +199,8 @@ jobs:
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
-
-      - name: Prepare chainsaw
-        id: versions
-        uses: ./.github/actions/load-versions
+          stability_window: 60s
+          recover_unhealthy: 'true'
 
       - name: Install chainsaw
         uses: ./.github/actions/setup-build-tools
@@ -196,6 +219,7 @@ jobs:
       # see a settled inference stack.
 
       - name: Verify expected resources exist
+        timeout-minutes: 3
         run: |
           go run ./tests/chainsaw/ai-conformance/ \
             --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
@@ -204,11 +228,20 @@ jobs:
             --kubeconfig="${HOME}/.kube/config" \
             --debug
 
+      - name: Build conformance validator image
+        uses: ./.github/actions/aicr-build
+        with:
+          build_cli: 'false'
+          build_snapshot_agent: 'false'
+          validator_phases: 'conformance'
+
       - name: Check control plane health before conformance validation
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
 
       - name: Validate CNCF AI Conformance
         id: validate-conformance
@@ -232,28 +265,9 @@ jobs:
       # training smoke test if needed.
       # --- Validation artifacts ---
 
-      # Collect a post-run resource snapshot whenever the runtime bundle
-      # installed. This preserves triage data for snapshot/chainsaw/conformance
-      # failures; continue-on-error keeps the original failure intact.
-      - name: Collect validation artifacts
-        if: >-
-          always()
-          && !cancelled()
-          && steps.bundle-install.outcome == 'success'
-        continue-on-error: true
-        shell: bash
-        run: |
-          set -o pipefail
-          mkdir -p conformance-evidence
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug | tee conformance-evidence/resource-existence-post.txt
-
       - name: Upload validation artifacts
         if: always()
+        timeout-minutes: 5
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
         with:
           name: conformance-evidence
@@ -264,57 +278,81 @@ jobs:
 
       - name: Debug diagnostics
         if: failure()
+        timeout-minutes: 5
+        shell: bash
         run: |
+          set -o pipefail
+          kubectl_kind() {
+            timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+          }
+
           echo "=== ClusterPolicy status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true
+          kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true
           echo "=== GPU Operator pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
+          kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true
           echo "=== Non-running pods (all namespaces) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
+          kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
           echo "=== Recent events (gpu-operator) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+          kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
           echo "=== Dynamo pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true
+          kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true
           echo "=== Dynamo operator logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
+          kubectl_kind -n dynamo-system \
             logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
           echo "=== Recent events (dynamo-system) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+          kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+          echo "=== KAI scheduler pods ==="
+          kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true
+          echo "=== KAI admission deployment ==="
+          kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true
+          echo "=== KAI admission deployment describe ==="
+          kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true
+          echo "=== KAI admission pod describe ==="
+          kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \
+            | grep '^pod/admission-' \
+            | while read -r pod; do
+                kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true
+              done || true
+          echo "=== KAI admission logs ==="
+          kubectl_kind -n kai-scheduler \
+            logs deployment/admission --all-containers --tail=200 2>/dev/null || true
+          echo "=== Recent events (kai-scheduler) ==="
+          kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
           echo "=== Custom metrics API ==="
           for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
             echo "--- ${METRIC} ---"
             for NS in gpu-operator dynamo-system; do
-              kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
+              kubectl_kind get --raw \
                 "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true
             done
           done
           echo "=== Grafana deployment ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true
+          kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true
           echo "=== Grafana pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \
+          kubectl_kind -n monitoring get pods \
             -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true
           echo "=== Grafana deployment describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true
+          kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true
           echo "=== Grafana pod describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \
+          kubectl_kind -n monitoring describe pods \
             -l app.kubernetes.io/name=grafana 2>/dev/null || true
           echo "=== prometheus-adapter pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
+          kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
           echo "=== kgateway pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kgateway-system get pods -o wide 2>/dev/null || true
+          kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true
           echo "=== GatewayClass status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true
+          kubectl_kind get gatewayclass -o yaml 2>/dev/null || true
           echo "=== Gateway status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get gateways -A -o yaml 2>/dev/null || true
+          kubectl_kind get gateways -A -o yaml 2>/dev/null || true
           echo "=== DCGM Exporter pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+          kubectl_kind -n gpu-operator \
             get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
           echo "=== Monitoring pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -o wide 2>/dev/null || true
+          kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true
           echo "=== DRA ResourceSlices ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get resourceslices -o wide 2>/dev/null || true
+          kubectl_kind get resourceslices -o wide 2>/dev/null || true
           echo "=== Node status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true
+          kubectl_kind get nodes -o wide 2>/dev/null || true
 
       - name: GPU Test Cleanup
         if: always()
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index 36a2d1eec..6193544fa 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -107,19 +107,33 @@ jobs:
         with:
           persist-credentials: false
 
+      - name: Load GPU test versions
+        id: versions
+        uses: ./.github/actions/load-versions
+
       - name: Set up GPU cluster
         uses: ./.github/actions/gpu-cluster-setup
+        with:
+          kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}
+          min_gpu_count: '2'
+          gpu_model_pattern: H100
+          min_free_disk_gb: '50'
+          min_available_memory_gb: '16'
+          cluster_create_timeout: 900s
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
         with:
-          validator_phases: 'conformance'
+          build_snapshot_agent: 'false'
+          validator_phases: 'none'
 
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
 
       - name: Install runtime bundle
         id: bundle-install
@@ -133,13 +147,23 @@ jobs:
           best_effort: 'false'
 
       - name: Check control plane health after runtime install
+        id: post_runtime_control_plane_health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
 
       # --- Snapshot and GPU validation ---
 
+      - name: Build snapshot agent image
+        uses: ./.github/actions/aicr-build
+        with:
+          build_cli: 'false'
+          build_snapshot_agent: 'true'
+          validator_phases: 'none'
+
       - name: Snapshot and validate GPU
         uses: ./.github/actions/gpu-snapshot-validate
         with:
@@ -155,6 +179,8 @@ jobs:
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
 
       - name: Install Karpenter + KWOK
         uses: ./.github/actions/install-karpenter-kwok
@@ -171,10 +197,8 @@ jobs:
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
-
-      - name: Prepare chainsaw
-        id: versions
-        uses: ./.github/actions/load-versions
+          stability_window: 60s
+          recover_unhealthy: 'true'
 
       - name: Install chainsaw
         uses: ./.github/actions/setup-build-tools
@@ -193,6 +217,7 @@ jobs:
       # has had time to bootstrap (pod-autoscaling check needs live metric data).
 
       - name: Verify expected resources exist
+        timeout-minutes: 3
         run: |
           go run ./tests/chainsaw/ai-conformance/ \
             --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
@@ -201,11 +226,20 @@ jobs:
             --kubeconfig="${HOME}/.kube/config" \
             --debug
 
+      - name: Build conformance validator image
+        uses: ./.github/actions/aicr-build
+        with:
+          build_cli: 'false'
+          build_snapshot_agent: 'false'
+          validator_phases: 'conformance'
+
       - name: Check control plane health before conformance validation
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
 
       - name: Validate CNCF AI Conformance
         id: validate-conformance
@@ -225,28 +259,9 @@ jobs:
 
       # --- Validation artifacts ---
 
-      # Collect a post-run resource snapshot whenever the runtime bundle
-      # installed. This preserves triage data for snapshot/chainsaw/conformance
-      # failures; continue-on-error keeps the original failure intact.
-      - name: Collect validation artifacts
-        if: >-
-          always()
-          && !cancelled()
-          && steps.bundle-install.outcome == 'success'
-        continue-on-error: true
-        shell: bash
-        run: |
-          set -o pipefail
-          mkdir -p conformance-evidence
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug | tee conformance-evidence/resource-existence-post.txt
-
       - name: Upload validation artifacts
         if: always()
+        timeout-minutes: 5
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
         with:
           name: conformance-evidence
@@ -259,41 +274,63 @@ jobs:
 
       - name: Debug diagnostics
         if: failure()
+        timeout-minutes: 5
+        shell: bash
         run: |
+          set -o pipefail
+          kubectl_kind() {
+            timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+          }
+
           echo "=== Grafana deployment ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true
+          kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true
           echo "=== Grafana pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \
+          kubectl_kind -n monitoring get pods \
             -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true
           echo "=== Grafana deployment describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true
+          kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true
           echo "=== Grafana pod describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \
+          kubectl_kind -n monitoring describe pods \
             -l app.kubernetes.io/name=grafana 2>/dev/null || true
           echo "=== KAI scheduler pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true
+          kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true
+          echo "=== KAI admission deployment ==="
+          kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true
+          echo "=== KAI admission deployment describe ==="
+          kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true
+          echo "=== KAI admission pod describe ==="
+          kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \
+            | grep '^pod/admission-' \
+            | while read -r pod; do
+                kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true
+              done || true
+          echo "=== KAI admission logs ==="
+          kubectl_kind -n kai-scheduler \
+            logs deployment/admission --all-containers --tail=200 2>/dev/null || true
           echo "=== KAI scheduler logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
+          kubectl_kind -n kai-scheduler \
             logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
           echo "=== KAI scheduler queues ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
+          kubectl_kind get queues -A 2>/dev/null || true
           echo "=== KAI scheduler podgroups ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true
+          kubectl_kind get podgroups -A 2>/dev/null || true
+          echo "=== Recent events (kai-scheduler) ==="
+          kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
           echo "=== Kubeflow Trainer deployment ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
+          kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
           echo "=== Kubeflow pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true
+          kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true
           echo "=== Kubeflow validating webhooks ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
+          kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
           echo "=== Kubeflow Trainer CRD ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
+          kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
           echo "=== Non-running pods (all namespaces) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
+          kubectl_kind get pods -A \
             --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
           echo "=== GPU Operator pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
+          kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true
           echo "=== Node resources ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
+          kubectl_kind describe nodes 2>/dev/null | \
             grep -A 20 "Allocated resources" || true
 
       - name: GPU Test Cleanup
diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
index 1b1f701ad..85aa33ab6 100644
--- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
+++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
@@ -110,6 +110,8 @@ spec:
     # ── KAI Scheduler ──────────────────────────────────────────────────
     - name: assert-kai-scheduler
       description: Verify KAI scheduler is available.
+      timeouts:
+        assert: 600s
       try:
         - assert:
             file: ../common/assert-kai-scheduler.yaml
diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
index 9809cd0bb..e3d2b35a9 100644
--- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
+++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
@@ -101,6 +101,8 @@ spec:
 
     - name: assert-kai-scheduler
       description: Verify KAI scheduler is available.
+      timeouts:
+        assert: 600s
       try:
         - assert:
             file: ../common/assert-kai-scheduler.yaml

From 0834778adcdd5fce8ae6f0fd3f6cc2b20713c9b4 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 18:27:13 -0700
Subject: [PATCH 05/21] ci: address H100 review feedback

---
 .github/actions/aicr-build/action.yml         |  7 --
 .../check-control-plane-health/action.yml     | 10 ++-
 .github/actions/gpu-cluster-setup/action.yml  | 65 ++++++++++++-------
 .../actions/gpu-snapshot-validate/action.yml  | 23 ++++---
 .github/actions/gpu-test-cleanup/action.yml   |  1 -
 5 files changed, 58 insertions(+), 48 deletions(-)

diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml
index ce9841f2f..14b68dcf8 100644
--- a/.github/actions/aicr-build/action.yml
+++ b/.github/actions/aicr-build/action.yml
@@ -36,13 +36,6 @@ inputs:
 runs:
   using: 'composite'
   steps:
-
-    - name: Install ko
-      shell: bash
-      run: |
-        KO_VERSION=$(yq eval '.build_tools.ko' .settings.yaml)
-        GOFLAGS= go install "github.com/google/ko@${KO_VERSION}"
-
     - name: Build aicr CLI binary
       if: inputs.build_cli == 'true' || inputs.build_snapshot_agent == 'true'
       shell: bash
diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index bee31623b..34d42937c 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -108,6 +108,10 @@ runs:
           timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
         }
 
+        docker_timeout() {
+          timeout 30s docker "$@"
+        }
+
         declare -A RECOVERY_ATTEMPTS=()
 
         kubectl_kind get --raw='/readyz' || true
@@ -209,12 +213,12 @@ runs:
           echo "::warning::${component} is unhealthy (${reason}); restarting static pod container (attempt $((attempt + 1))/${MAX_RECOVERY_ATTEMPTS})"
           dump_component_diagnostics "${component}"
 
-          if ! docker inspect "${node}" >/dev/null 2>&1; then
+          if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
             echo "::warning::cannot recover ${component}: kind node container ${node} not found"
             return 1
           fi
 
-          container_ids=$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)
+          container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)
           if [[ -z "${container_ids}" ]]; then
             echo "::warning::cannot recover ${component}: no running container found in ${node}"
             return 1
@@ -222,7 +226,7 @@ runs:
 
           for container_id in ${container_ids}; do
             echo "Stopping ${component} container ${container_id} in ${node}..."
-            if ! docker exec "${node}" crictl stop "${container_id}"; then
+            if ! docker_timeout exec "${node}" crictl stop "${container_id}"; then
               echo "::warning::failed to stop ${component} container ${container_id}"
               return 1
             fi
diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index c5b5bb3cd..d4ea8f744 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -89,7 +89,6 @@ runs:
         MIN_GPU_COUNT: ${{ inputs.min_gpu_count }}
         MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
         MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }}
-        KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
       run: |
         set -euo pipefail
 
@@ -110,18 +109,6 @@ runs:
           fi
         done
 
-        free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
-        if (( free_disk_gb < MIN_FREE_DISK_GB )); then
-          echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB"
-          exit 1
-        fi
-
-        available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}')
-        if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then
-          echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB"
-          exit 1
-        fi
-
         echo "=== Docker health ==="
         docker info >/dev/null
         docker version
@@ -148,16 +135,6 @@ runs:
         kind get clusters || true
         docker ps -a --filter "name=${KIND_CLUSTER_NAME}" || true
 
-        if [[ -n "${KIND_NODE_IMAGE}" ]]; then
-          echo "=== Kind node image cache ==="
-          if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then
-            echo "Kind node image already cached: ${KIND_NODE_IMAGE}"
-          else
-            echo "Pulling kind node image: ${KIND_NODE_IMAGE}"
-            timeout 600s docker pull "${KIND_NODE_IMAGE}"
-          fi
-        fi
-
     - name: Configure NVIDIA Container Toolkit for kind
       shell: bash
       run: |
@@ -170,7 +147,7 @@ runs:
       shell: bash
       run: |
         set -euo pipefail
-        docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L
+        timeout 300s docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L
 
     - name: Increase inotify limits
       shell: bash
@@ -184,7 +161,9 @@ runs:
         set -euo pipefail
         if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then
           echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
-          timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"
+          if ! timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then
+            echo "::warning::kind delete cluster timed out or failed; falling back to direct container cleanup"
+          fi
         else
           echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
         fi
@@ -203,6 +182,42 @@ runs:
           exit 1
         fi
 
+    - name: Check runner capacity
+      shell: bash
+      env:
+        MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
+        MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }}
+      run: |
+        set -euo pipefail
+        free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
+        if (( free_disk_gb < MIN_FREE_DISK_GB )); then
+          echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB"
+          exit 1
+        fi
+
+        available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}')
+        if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then
+          echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB"
+          exit 1
+        fi
+
+        echo "Runner capacity is sufficient: disk=${free_disk_gb}GiB memory=${available_memory_gb}GiB"
+
+    - name: Warm kind node image
+      if: ${{ inputs.kind_node_image != '' }}
+      shell: bash
+      env:
+        KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
+      run: |
+        set -euo pipefail
+        echo "=== Kind node image cache ==="
+        if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then
+          echo "Kind node image already cached: ${KIND_NODE_IMAGE}"
+        else
+          echo "Pulling kind node image: ${KIND_NODE_IMAGE}"
+          timeout 600s docker pull "${KIND_NODE_IMAGE}"
+        fi
+
     - name: Create GPU-enabled kind cluster
       shell: bash
       env:
diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml
index 9cfb67e9c..b89224a60 100644
--- a/.github/actions/gpu-snapshot-validate/action.yml
+++ b/.github/actions/gpu-snapshot-validate/action.yml
@@ -69,22 +69,26 @@ runs:
       if: failure()
       shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
       run: |
+        # Run kubectl against this job's kind cluster with hard time limits so a
+        # hung API server cannot stall failure diagnostics: each API request is
+        # capped at 10s and the whole invocation at 30s.
+        kubectl_kind() {
+          timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+        }
+
         echo "=== Snapshot Job ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default get job aicr -o yaml || true
+        kubectl_kind -n default get job aicr -o yaml || true
         echo "=== Snapshot Pods ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          get pods -l app.kubernetes.io/name=aicr -o wide || true
+        kubectl_kind -n default get pods -l app.kubernetes.io/name=aicr -o wide || true
         echo "=== Snapshot Job describe ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default describe job aicr || true
+        kubectl_kind -n default describe job aicr || true
         echo "=== Snapshot Pod describe ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          describe pods -l app.kubernetes.io/name=aicr || true
+        kubectl_kind -n default describe pods -l app.kubernetes.io/name=aicr || true
         echo "=== Snapshot current logs ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true
+        kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true
         echo "=== Snapshot previous logs ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true
+        kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true
         echo "=== Snapshot ConfigMap ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          get configmap aicr-snapshot -o yaml || true
+        kubectl_kind -n default get configmap aicr-snapshot -o yaml || true
diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml
index e8536e530..2e3ca2685 100644
--- a/.github/actions/gpu-test-cleanup/action.yml
+++ b/.github/actions/gpu-test-cleanup/action.yml
@@ -71,7 +71,6 @@ runs:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
       run: |
         timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true
-        timeout 120s docker system prune -f || true
 
     - name: Upload debug artifacts
       if: failure()

From dfd2685dbdc557e7847d7fbc3d364965627948c3 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 18:28:54 -0700
Subject: [PATCH 06/21] ci: clarify control-plane recovery handling

---
 .github/actions/check-control-plane-health/action.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index 34d42937c..fda1075c7 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -32,7 +32,7 @@ inputs:
     required: false
     default: 60s
   max_restarts:
-    description: 'Maximum tolerated restart count for each control-plane container'
+    description: 'Maximum tolerated restart count. With stability_window > 0, historical restarts are diagnostic and new restarts fail.'
     required: false
     default: '1'
   stability_window:
@@ -218,7 +218,10 @@ runs:
             return 1
           fi
 
-          container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)
+          if ! container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null); then
+            echo "::warning::cannot recover ${component}: timed out or failed to list containers in ${node}"
+            return 1
+          fi
           if [[ -z "${container_ids}" ]]; then
             echo "::warning::cannot recover ${component}: no running container found in ${node}"
             return 1

From 87d84a0bdaf74a460758ef03dd80f506c94d7642 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 18:35:44 -0700
Subject: [PATCH 07/21] ci: address remaining review comments

---
 .github/actions/aicr-build/action.yml         |  1 +
 .../check-control-plane-health/action.yml     |  2 +-
 .github/actions/gpu-cluster-setup/action.yml  | 30 +++++++++++++++----
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml
index 14b68dcf8..14d6a595b 100644
--- a/.github/actions/aicr-build/action.yml
+++ b/.github/actions/aicr-build/action.yml
@@ -43,6 +43,7 @@ runs:
         GOFLAGS: -mod=vendor
       run: |
         set -euo pipefail
+        mkdir -p dist
         if [[ -x dist/aicr ]]; then
           echo "Reusing existing dist/aicr"
           exit 0
diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index fda1075c7..4063a330f 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -120,7 +120,7 @@ runs:
           local component="$1"
           local selector="component=${component}"
 
-          if ! kubectl --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+          if ! timeout "${WAIT_TIMEOUT}" kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
             wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then
             return 1
           fi
diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index d4ea8f744..fd310e8fa 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -119,7 +119,17 @@ runs:
 
         mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader)
         if [[ -n "${GPU_MODEL_PATTERN}" ]]; then
-          gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic "${GPU_MODEL_PATTERN}" || true)
+          set +e
+          gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic -- "${GPU_MODEL_PATTERN}")
+          grep_status=$?
+          set -e
+          if (( grep_status == 2 )); then
+            echo "::error::invalid gpu_model_pattern regex: ${GPU_MODEL_PATTERN}"
+            exit 1
+          fi
+          if (( grep_status != 0 )); then
+            gpu_count=0
+          fi
           echo "Visible GPUs matching '${GPU_MODEL_PATTERN}': ${gpu_count}"
         else
           gpu_count="${#gpu_names[@]}"
@@ -133,7 +143,7 @@ runs:
 
         echo "=== Existing kind state ==="
         kind get clusters || true
-        docker ps -a --filter "name=${KIND_CLUSTER_NAME}" || true
+        docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
 
     - name: Configure NVIDIA Container Toolkit for kind
       shell: bash
@@ -159,6 +169,7 @@ runs:
       shell: bash
       run: |
         set -euo pipefail
+        kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
         if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then
           echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
           if ! timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then
@@ -168,17 +179,17 @@ runs:
           echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
         fi
 
-        remaining_containers=$(docker ps -aq --filter "name=${KIND_CLUSTER_NAME}")
+        remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")
         if [[ -n "${remaining_containers}" ]]; then
           echo "Removing stale containers for ${KIND_CLUSTER_NAME}:"
-          docker ps -a --filter "name=${KIND_CLUSTER_NAME}"
+          docker ps -a --filter "label=${kind_cluster_label}"
           docker rm -f ${remaining_containers}
         fi
 
-        remaining_containers=$(docker ps -aq --filter "name=${KIND_CLUSTER_NAME}")
+        remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")
         if [[ -n "${remaining_containers}" ]]; then
           echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:"
-          docker ps -a --filter "name=${KIND_CLUSTER_NAME}"
+          docker ps -a --filter "label=${kind_cluster_label}"
           exit 1
         fi
 
@@ -208,6 +219,7 @@ runs:
       shell: bash
       env:
         KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
+        MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
       run: |
         set -euo pipefail
         echo "=== Kind node image cache ==="
@@ -217,6 +229,12 @@ runs:
           echo "Pulling kind node image: ${KIND_NODE_IMAGE}"
           timeout 600s docker pull "${KIND_NODE_IMAGE}"
         fi
+        free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
+        if (( free_disk_gb < MIN_FREE_DISK_GB )); then
+          echo "::error::free disk on / is ${free_disk_gb}GiB after warming ${KIND_NODE_IMAGE}, need at least ${MIN_FREE_DISK_GB}GiB"
+          exit 1
+        fi
+        echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gb}GiB"
 
     - name: Create GPU-enabled kind cluster
       shell: bash

From d14f9c56d4a8fc358d200e58d0a4b44d4ddf3fa1 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 18:39:51 -0700
Subject: [PATCH 08/21] ci: bound H100 retry budgets

---
 .../check-control-plane-health/action.yml     | 32 +++++++++++--------
 pkg/bundler/deployer/helm/helm_test.go        |  6 ++--
 .../deployer/helm/templates/deploy.sh.tmpl    |  8 +++--
 3 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index 4063a330f..153ecfa2b 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -32,7 +32,7 @@ inputs:
     required: false
     default: 60s
   max_restarts:
-    description: 'Maximum tolerated restart count. With stability_window > 0, historical restarts are diagnostic and new restarts fail.'
+    description: 'Maximum tolerated restart count for each control-plane component'
     required: false
     default: '1'
   stability_window:
@@ -152,6 +152,35 @@ runs:
           echo "${total}"
         }
 
+        # Fail the step when a component's restart count exceeds max_restarts.
+        #
+        # Arguments:
+        #   $1  control-plane component name (used in messages and diagnostics)
+        #   $2  restart count to compare against MAX_RESTARTS
+        #
+        # Exits 1 after dumping component diagnostics and /readyz output when the
+        # budget is exceeded or the count is not a number; otherwise returns 0.
+        enforce_restart_budget() {
+          local component="$1"
+          local restart_count="$2"
+
+          # An empty or malformed count would be evaluated as 0 (or rejected with
+          # a non-fatal error) by bash arithmetic and silently pass the budget.
+          if [[ ! "${restart_count}" =~ ^[0-9]+$ ]]; then
+            echo "::error::${component} restartCount is not a number: '${restart_count}'"
+            dump_component_diagnostics "${component}"
+            kubectl_kind get --raw='/readyz' || true
+            exit 1
+          fi
+
+          if (( restart_count > MAX_RESTARTS )); then
+            echo "::error::${component} restartCount=${restart_count} exceeds max_restarts=${MAX_RESTARTS}"
+            dump_component_diagnostics "${component}"
+            kubectl_kind get --raw='/readyz' || true
+            exit 1
+          fi
+        }
+
         dump_component_diagnostics() {
           local component="$1"
           local selector="component=${component}"
@@ -281,13 +293,9 @@ runs:
             fi
           fi
           initial_restarts=$(restart_total "${component}")
+          enforce_restart_budget "${component}" "${initial_restarts}"
 
           if [[ "${STABILITY_WINDOW}" != "0s" ]]; then
-            if (( initial_restarts > MAX_RESTARTS )); then
-              echo "::warning::${component} historical restartCount=${initial_restarts}; checking for stability over ${STABILITY_WINDOW}"
-              dump_component_diagnostics "${component}"
-            fi
-
             sleep "${STABILITY_WINDOW}"
             if ! wait_ready "${component}"; then
               if ! try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then
@@ -297,6 +305,7 @@ runs:
                 exit 1
               fi
               initial_restarts=$(restart_total "${component}")
+              enforce_restart_budget "${component}" "${initial_restarts}"
               sleep "${STABILITY_WINDOW}"
               if ! wait_ready "${component}"; then
                 echo "::error::${component} pods became unready after recovery"
@@ -306,6 +315,7 @@ runs:
               fi
             fi
             final_restarts=$(restart_total "${component}")
+            enforce_restart_budget "${component}" "${final_restarts}"
             if (( final_restarts > initial_restarts )); then
               if ! try_recover_component "${component}" "restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"; then
                 echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"
@@ -313,6 +323,7 @@ runs:
                 exit 1
               fi
               initial_restarts=$(restart_total "${component}")
+              enforce_restart_budget "${component}" "${initial_restarts}"
               sleep "${STABILITY_WINDOW}"
               if ! wait_ready "${component}"; then
                 echo "::error::${component} pods became unready after recovery"
@@ -321,6 +332,7 @@ runs:
                 exit 1
               fi
               final_restarts=$(restart_total "${component}")
+              enforce_restart_budget "${component}" "${final_restarts}"
               if (( final_restarts > initial_restarts )); then
                 echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery"
                 dump_component_diagnostics "${component}"
@@ -329,14 +341,6 @@ runs:
             fi
             return
           fi
-
-          if (( initial_restarts > MAX_RESTARTS )); then
-            if ! try_recover_component "${component}" "restartCount=${initial_restarts}"; then
-              echo "::error::${component} restartCount=${initial_restarts}"
-              dump_component_diagnostics "${component}"
-              exit 1
-            fi
-          fi
         }
 
         for component in ${COMPONENTS}; do
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index 79c22f8fd..0a14dc03e 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -555,9 +555,9 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 				Type:      recipe.ComponentTypeHelm,
 				Source:    "https://prometheus-community.github.io/helm-charts",
 			},
-			wantTimeout:    `COMPONENT_HELM_TIMEOUT="20m"`,
-			wantComment:    `Keep the default retry budget for kube-prometheus-stack`,
-			rejectRetryCap: true,
+			wantTimeout:  `COMPONENT_HELM_TIMEOUT="20m"`,
+			wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 2 ]]`,
+			wantComment:  `Allow the observed third-attempt Grafana success pattern`,
 		},
 	}
 
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index e456bba58..02c04d0a9 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -387,9 +387,11 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
 fi
 {{ else if eq .Name "kube-prometheus-stack" -}}
 COMPONENT_HELM_TIMEOUT="20m"
-# Keep the default retry budget for kube-prometheus-stack. On cold H100
-# runners, Grafana can hit ProgressDeadlineExceeded multiple times before
-# images and rollout state are warm enough for a later retry to succeed.
+# Allow the observed third-attempt Grafana success pattern, but cap the budget
+# so kube-prometheus-stack cannot consume most of the H100 workflow timeout.
+if [[ "${COMPONENT_MAX_RETRIES}" -gt 2 ]]; then
+  COMPONENT_MAX_RETRIES="2"
+fi
 {{ end -}}
 # Derive wait args: global --wait/--no-wait behavior + component timeout.
 if [[ "${NO_WAIT}" == "true" ]]; then

From 7a3505ccec453aded444fe995d150861ec0c23dd Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 18:50:33 -0700
Subject: [PATCH 09/21] ci: install ko for Karpenter KWOK

---
 .github/actions/install-karpenter-kwok/action.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml
index dab642174..c917b2abc 100644
--- a/.github/actions/install-karpenter-kwok/action.yml
+++ b/.github/actions/install-karpenter-kwok/action.yml
@@ -40,8 +40,15 @@ runs:
       shell: bash
       run: |
         echo "karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" >> "$GITHUB_OUTPUT"
+        echo "ko=$(yq eval '.build_tools.ko' .settings.yaml)" >> "$GITHUB_OUTPUT"
         echo "go=$(go env GOVERSION)" >> "$GITHUB_OUTPUT"
 
+    - name: Install ko
+      uses: ./.github/actions/setup-build-tools
+      with:
+        install_ko: 'true'
+        ko_version: ${{ steps.versions.outputs.ko }}
+
     - name: Cache Karpenter Go build cache
       uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684  # v4.2.3
       with:

From c1ccd86ab53c8a653dd0f2d6fae3480af0d534b0 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 20:23:37 -0700
Subject: [PATCH 10/21] ci: retry KWOK Helm bootstrap

---
 kwok/scripts/run-all-recipes.sh | 34 ++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh
index 459b054b5..6b4af1549 100755
--- a/kwok/scripts/run-all-recipes.sh
+++ b/kwok/scripts/run-all-recipes.sh
@@ -37,6 +37,47 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $*"; }
 log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
 log_error() { echo -e "${RED}[ERROR]${NC} $*"; }
 
+# Run a command, retrying with exponential backoff when it fails.
+#
+# Usage: retry_command <description> <command> [args...]
+#
+# Environment:
+#   KWOK_COMMAND_RETRIES      maximum number of attempts (default: 3)
+#   KWOK_COMMAND_RETRY_DELAY  seconds before the first retry; doubled after
+#                             each failed attempt (default: 5)
+#
+# Returns 0 as soon as the command succeeds; otherwise returns the exit status
+# of the final failed attempt so callers and logs keep the real failure code.
+retry_command() {
+    local description="$1"
+    shift
+
+    local max_attempts="${KWOK_COMMAND_RETRIES:-3}"
+    local delay="${KWOK_COMMAND_RETRY_DELAY:-5}"
+    local attempt=1
+    local status=0
+
+    while true; do
+        # Run in a condition context so a failure is captured here instead of
+        # aborting the script when errexit is enabled.
+        if "$@"; then
+            return 0
+        else
+            status=$?
+        fi
+
+        if ((attempt >= max_attempts)); then
+            log_error "${description} failed after ${attempt} attempt(s) (exit status ${status})"
+            return "${status}"
+        fi
+
+        log_warn "${description} failed (attempt ${attempt}/${max_attempts}); retrying in ${delay}s..."
+        sleep "${delay}"
+        attempt=$((attempt + 1))
+        delay=$((delay * 2))
+    done
+}
+
 # Find recipes with service criteria (testable cloud configurations)
 get_recipes() {
     for overlay in "${OVERLAYS_DIR}"/*.yaml; do
@@ -68,10 +93,13 @@ ensure_cluster() {
 
     if ! kubectl get deployment -n kube-system kwok-controller &>/dev/null; then
         log_info "Installing KWOK controller..."
-        helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update
-        helm upgrade --install kwok-controller kwok/kwok \
+        retry_command "Adding KWOK Helm repository" \
+            helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update
+        retry_command "Installing KWOK controller" \
+            helm upgrade --install kwok-controller kwok/kwok \
             --namespace kube-system --set hostNetwork=true --wait
-        helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system
+        retry_command "Installing KWOK stage-fast" \
+            helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system
     fi
 
     # Patch kindnet to exclude KWOK nodes

From 34f115e3c20b927c9ec1e14d47858a47808b3dd9 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 20:45:06 -0700
Subject: [PATCH 11/21] ci: share control plane stability window

---
 .../check-control-plane-health/action.yml     | 81 +++++++++++++------
 1 file changed, 57 insertions(+), 24 deletions(-)

diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index 153ecfa2b..600310972 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -264,7 +264,6 @@ runs:
           local selector="component=${component}"
           local pods
           local initial_restarts
-          local final_restarts
 
           if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then
             if ! try_recover_component "${component}" "failed to list pods in ${NAMESPACE} with selector ${selector}"; then
@@ -294,9 +293,27 @@ runs:
           fi
           initial_restarts=$(restart_total "${component}")
           enforce_restart_budget "${component}" "${initial_restarts}"
+          INITIAL_RESTARTS["${component}"]="${initial_restarts}"
+        }
+
+        verify_stability_window() {
+          local component
+          local initial_restarts
+          local final_restarts
+          local recovered=false
 
-          if [[ "${STABILITY_WINDOW}" != "0s" ]]; then
-            sleep "${STABILITY_WINDOW}"
+          if [[ "${STABILITY_WINDOW}" == "0s" ]]; then
+            return
+          fi
+
+          echo "Observing control-plane stability for ${STABILITY_WINDOW}..."
+          sleep "${STABILITY_WINDOW}"
+          for component in ${COMPONENTS}; do
+            initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
+            if [[ -z "${initial_restarts}" ]]; then
+              echo "::error::missing initial restart count for ${component}"
+              exit 1
+            fi
             if ! wait_ready "${component}"; then
               if ! try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then
                 echo "::error::${component} pods became unready during ${STABILITY_WINDOW}"
@@ -306,13 +323,9 @@ runs:
               fi
               initial_restarts=$(restart_total "${component}")
               enforce_restart_budget "${component}" "${initial_restarts}"
-              sleep "${STABILITY_WINDOW}"
-              if ! wait_ready "${component}"; then
-                echo "::error::${component} pods became unready after recovery"
-                dump_component_diagnostics "${component}"
-                kubectl_kind get --raw='/readyz' || true
-                exit 1
-              fi
+              INITIAL_RESTARTS["${component}"]="${initial_restarts}"
+              recovered=true
+              continue
             fi
             final_restarts=$(restart_total "${component}")
             enforce_restart_budget "${component}" "${final_restarts}"
@@ -324,26 +337,46 @@ runs:
               fi
               initial_restarts=$(restart_total "${component}")
               enforce_restart_budget "${component}" "${initial_restarts}"
-              sleep "${STABILITY_WINDOW}"
-              if ! wait_ready "${component}"; then
-                echo "::error::${component} pods became unready after recovery"
-                dump_component_diagnostics "${component}"
-                kubectl_kind get --raw='/readyz' || true
-                exit 1
-              fi
-              final_restarts=$(restart_total "${component}")
-              enforce_restart_budget "${component}" "${final_restarts}"
-              if (( final_restarts > initial_restarts )); then
-                echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery"
-                dump_component_diagnostics "${component}"
-                exit 1
-              fi
+              INITIAL_RESTARTS["${component}"]="${initial_restarts}"
+              recovered=true
+              continue
             fi
+            INITIAL_RESTARTS["${component}"]="${final_restarts}"
+          done
+
+          if [[ "${recovered}" != "true" ]]; then
             return
           fi
+
+          echo "::warning::control-plane recovery occurred; observing one additional ${STABILITY_WINDOW} stability window"
+          sleep "${STABILITY_WINDOW}"
+          for component in ${COMPONENTS}; do
+            initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
+            if [[ -z "${initial_restarts}" ]]; then
+              echo "::error::missing post-recovery restart count for ${component}"
+              exit 1
+            fi
+            if ! wait_ready "${component}"; then
+              echo "::error::${component} pods became unready after recovery"
+              dump_component_diagnostics "${component}"
+              kubectl_kind get --raw='/readyz' || true
+              exit 1
+            fi
+            final_restarts=$(restart_total "${component}")
+            enforce_restart_budget "${component}" "${final_restarts}"
+            if (( final_restarts > initial_restarts )); then
+              echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery"
+              dump_component_diagnostics "${component}"
+              exit 1
+            fi
+            INITIAL_RESTARTS["${component}"]="${final_restarts}"
+          done
         }
 
+        declare -A INITIAL_RESTARTS=()
+
         for component in ${COMPONENTS}; do
           check_component "${component}"
         done
+        verify_stability_window
         kubectl_kind get --raw='/readyz'

From 73dfd1e4929a14eca1a03f0e00239770b6d937de Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sat, 25 Apr 2026 22:12:31 -0700
Subject: [PATCH 12/21] ci: harden H100 runtime diagnostics

---
 .github/actions/gpu-cluster-setup/action.yml  | 180 ++++++++++++++++++
 .../actions/gpu-operator-install/action.yml   |  13 +-
 .github/actions/gpu-test-cleanup/action.yml   |   9 +
 .../workflows/gpu-h100-inference-test.yaml    |  71 ++++---
 .github/workflows/gpu-h100-training-test.yaml |  62 +++---
 pkg/bundler/deployer/helm/helm_test.go        |   6 +-
 .../deployer/helm/templates/deploy.sh.tmpl    |   9 +-
 7 files changed, 296 insertions(+), 54 deletions(-)

diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index fd310e8fa..3800b4b99 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -40,6 +40,42 @@ inputs:
     description: 'Timeout for nvkind cluster create'
     required: false
     default: '900s'
+  control_plane_resource_patches:
+    description: 'Apply kubeadm patches that raise control-plane static pod resource requests'
+    required: false
+    default: 'false'
+  api_server_cpu_request:
+    description: 'kube-apiserver CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '1000m'
+  api_server_memory_request:
+    description: 'kube-apiserver memory request when control_plane_resource_patches is true'
+    required: false
+    default: '1Gi'
+  controller_manager_cpu_request:
+    description: 'kube-controller-manager CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '1000m'
+  controller_manager_memory_request:
+    description: 'kube-controller-manager memory request when control_plane_resource_patches is true'
+    required: false
+    default: '512Mi'
+  scheduler_cpu_request:
+    description: 'kube-scheduler CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '500m'
+  scheduler_memory_request:
+    description: 'kube-scheduler memory request when control_plane_resource_patches is true'
+    required: false
+    default: '256Mi'
+  etcd_cpu_request:
+    description: 'etcd CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '1000m'
+  etcd_memory_request:
+    description: 'etcd memory request when control_plane_resource_patches is true'
+    required: false
+    default: '1Gi'
 
 runs:
   using: 'composite'
@@ -241,6 +277,15 @@ runs:
       env:
         KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
         CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }}
+        CONTROL_PLANE_RESOURCE_PATCHES: ${{ inputs.control_plane_resource_patches }}
+        API_SERVER_CPU_REQUEST: ${{ inputs.api_server_cpu_request }}
+        API_SERVER_MEMORY_REQUEST: ${{ inputs.api_server_memory_request }}
+        CONTROLLER_MANAGER_CPU_REQUEST: ${{ inputs.controller_manager_cpu_request }}
+        CONTROLLER_MANAGER_MEMORY_REQUEST: ${{ inputs.controller_manager_memory_request }}
+        SCHEDULER_CPU_REQUEST: ${{ inputs.scheduler_cpu_request }}
+        SCHEDULER_MEMORY_REQUEST: ${{ inputs.scheduler_memory_request }}
+        ETCD_CPU_REQUEST: ${{ inputs.etcd_cpu_request }}
+        ETCD_MEMORY_REQUEST: ${{ inputs.etcd_memory_request }}
       run: |
         set -euo pipefail
 
@@ -250,6 +295,131 @@ runs:
           CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}")
         fi
 
+        case "${CONTROL_PLANE_RESOURCE_PATCHES}" in
+          true) ;;
+          ""|false) CONTROL_PLANE_RESOURCE_PATCHES=false ;;
+          *)
+            echo "::error::control_plane_resource_patches must be true or false, got '${CONTROL_PLANE_RESOURCE_PATCHES}'"
+            exit 1
+            ;;
+        esac
+
+        if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+          patch_dir="$(mktemp -d)"
+          config_template="$(mktemp)"
+
+          # Keep heredoc body indentation aligned with this run block. GitHub
+          # Actions strips the common run: | indent before bash sees it.
+          cat > "${patch_dir}/kube-apiserver+strategic.yaml" <<EOF
+        apiVersion: v1
+        kind: Pod
+        metadata:
+          name: kube-apiserver
+          namespace: kube-system
+        spec:
+          containers:
+          - name: kube-apiserver
+            resources:
+              requests:
+                cpu: ${API_SERVER_CPU_REQUEST}
+                memory: ${API_SERVER_MEMORY_REQUEST}
+        EOF
+
+          cat > "${patch_dir}/kube-controller-manager+strategic.yaml" <<EOF
+        apiVersion: v1
+        kind: Pod
+        metadata:
+          name: kube-controller-manager
+          namespace: kube-system
+        spec:
+          containers:
+          - name: kube-controller-manager
+            resources:
+              requests:
+                cpu: ${CONTROLLER_MANAGER_CPU_REQUEST}
+                memory: ${CONTROLLER_MANAGER_MEMORY_REQUEST}
+        EOF
+
+          cat > "${patch_dir}/kube-scheduler+strategic.yaml" <<EOF
+        apiVersion: v1
+        kind: Pod
+        metadata:
+          name: kube-scheduler
+          namespace: kube-system
+        spec:
+          containers:
+          - name: kube-scheduler
+            resources:
+              requests:
+                cpu: ${SCHEDULER_CPU_REQUEST}
+                memory: ${SCHEDULER_MEMORY_REQUEST}
+        EOF
+
+          cat > "${patch_dir}/etcd+strategic.yaml" <<EOF
+        apiVersion: v1
+        kind: Pod
+        metadata:
+          name: etcd
+          namespace: kube-system
+        spec:
+          containers:
+          - name: etcd
+            resources:
+              requests:
+                cpu: ${ETCD_CPU_REQUEST}
+                memory: ${ETCD_MEMORY_REQUEST}
+        EOF
+
+          cat > "${config_template}" <<'EOF'
+        kind: Cluster
+        apiVersion: kind.x-k8s.io/v1alpha4
+        {{- if hasKey $ "name" }}
+        name: {{ $.name }}
+        {{- end }}
+        nodes:
+        - role: control-plane
+          {{- if hasKey $ "image" }}
+          image: {{ $.image }}
+          {{- end }}
+          extraMounts:
+          - hostPath: __PATCH_DIR__
+            containerPath: /patches
+          kubeadmConfigPatches:
+          - |
+            kind: InitConfiguration
+            patches:
+              directory: /patches
+        {{- range $.workers }}
+        - role: worker
+          {{- if hasKey $ "image" }}
+          image: {{ $.image }}
+          {{- end }}
+
+          {{- if hasKey . "devices" }}
+          {{- $devices := .devices }}
+          {{- if not (kindIs "slice" $devices) }}
+            {{- $devices = list .devices }}
+          {{- end }}
+          extraMounts:
+            # We inject all NVIDIA GPUs using the nvidia-container-runtime.
+            # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set
+            # in `/etc/nvidia-container-runtime/config.toml`
+            {{- range $d := $devices }}
+            - hostPath: /dev/null
+              containerPath: /var/run/nvidia-container-devices/{{ $d }}
+            {{- end }}
+          {{- end }}
+        {{- end }}
+        EOF
+          sed -i "s#__PATCH_DIR__#${patch_dir}#g" "${config_template}"
+          echo "Applying control-plane static pod resource patches from ${patch_dir}:"
+          for patch_file in "${patch_dir}"/*.yaml; do
+            echo "--- ${patch_file}"
+            sed 's/^/  /' "${patch_file}"
+          done
+          CREATE_ARGS+=(--config-template="${config_template}")
+        fi
+
         set +e
         timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}"
         create_status=$?
@@ -261,6 +431,16 @@ runs:
         kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s
         kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info
         kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide
+        kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes | \
+          grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:|  cpu|  memory|  nvidia.com/gpu)" || true
+
+        echo "=== Kind node container resources ==="
+        docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+          --format '{{.Names}}' | sort | while read -r node_container; do
+            [[ -z "${node_container}" ]] && continue
+            docker inspect "${node_container}" \
+              --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}'
+          done || true  # diagnostics only: a failed inspect must not fail cluster setup
 
         echo "=== Control-plane resource requests/limits ==="
         kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml
index 86d247932..2727c5e6a 100644
--- a/.github/actions/gpu-operator-install/action.yml
+++ b/.github/actions/gpu-operator-install/action.yml
@@ -39,6 +39,10 @@ inputs:
     description: 'Continue deploying remaining bundle components after a component failure'
     required: false
     default: 'true'
+  deploy_trace:
+    description: 'Run generated deploy.sh with bash xtrace for CI diagnostics. Do not enable for bundles with secret command arguments.'
+    required: false
+    default: 'false'
 
 runs:
   using: 'composite'
@@ -124,7 +128,14 @@ runs:
         else
           echo "Deploying bundle with default args"
         fi
-        ./deploy.sh "${DEPLOY_ARGS[@]}"
+        if [[ "${{ inputs.deploy_trace }}" == "true" ]]; then
+          echo "Deploy trace enabled: running deploy.sh with bash xtrace"
+          echo "::warning::deploy_trace prints shell command arguments; disable it for bundles with secret values"
+          export PS4='+ ${BASH_SOURCE}:${LINENO}: '
+          bash -x ./deploy.sh "${DEPLOY_ARGS[@]}"
+        else
+          ./deploy.sh "${DEPLOY_ARGS[@]}"
+        fi
 
     - name: Wait for GPU operands (bundle)
       if: inputs.method == 'bundle'
diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml
index 2e3ca2685..5d1cef3e4 100644
--- a/.github/actions/gpu-test-cleanup/action.yml
+++ b/.github/actions/gpu-test-cleanup/action.yml
@@ -71,6 +71,15 @@ runs:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
       run: |
         timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true
+        kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+        mapfile -t remaining_containers < <(docker ps -aq --filter "label=${kind_cluster_label}")  # best-effort: empty on docker failure
+        if (( ${#remaining_containers[@]} > 0 )); then
+          echo "Removing leftover kind containers for ${KIND_CLUSTER_NAME}:"
+          docker ps -a --filter "label=${kind_cluster_label}" || true
+          docker rm -f "${remaining_containers[@]}" || true
+        fi
+        timeout 60s docker builder prune -f --filter "until=24h" || true
+        timeout 60s docker system prune -f --filter "until=24h" || true
 
     - name: Upload debug artifacts
       if: failure()
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index 95df8740e..95340fbc8 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -58,6 +58,7 @@ jobs:
               - 'pkg/evidence/**'
               - '.github/actions/gpu-test-cleanup/**'
               - '.github/actions/load-versions/**'
+              - 'pkg/bundler/deployer/helm/**'
               - 'tests/chainsaw/chainsaw-config.yaml'
               - 'tests/chainsaw/ai-conformance/main.go'
               - 'tests/chainsaw/ai-conformance/common/**'
@@ -124,6 +125,7 @@ jobs:
           min_free_disk_gb: '50'
           min_available_memory_gb: '16'
           cluster_create_timeout: 900s
+          control_plane_resource_patches: 'true'
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
@@ -131,12 +133,14 @@ jobs:
           build_snapshot_agent: 'false'
           validator_phases: 'none'
 
+      # Fast readiness gate after cluster setup. Stability windows start after
+      # runtime install, where component rollouts can stress the control plane.
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
-          stability_window: 60s
+          stability_window: 0s
           recover_unhealthy: 'true'
 
       - name: Install runtime bundle
@@ -148,8 +152,9 @@ jobs:
           platform: dynamo
           wait: 'true'
           best_effort: 'false'
+          deploy_trace: 'true'
 
-      - name: Check control plane health after runtime install
+      - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
@@ -176,7 +181,7 @@ jobs:
 
       # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
 
-      - name: Check control plane health before Karpenter
+      - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
@@ -194,7 +199,13 @@ jobs:
 
       # --- Health checks ---
 
-      - name: Check control plane health after Karpenter
+      - name: Install chainsaw
+        uses: ./.github/actions/setup-build-tools
+        with:
+          install_chainsaw: 'true'
+          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
+
+      - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
@@ -202,32 +213,20 @@ jobs:
           stability_window: 60s
           recover_unhealthy: 'true'
 
-      - name: Install chainsaw
-        uses: ./.github/actions/setup-build-tools
-        with:
-          install_chainsaw: 'true'
-          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
-
       - name: Run chainsaw health checks
+        # The H100 stack can make namespace cleanup API calls slow under load.
+        # Keep cleanup enabled, but allow more than the default 30s deadline.
         run: |
           chainsaw test \
             --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --config tests/chainsaw/chainsaw-config.yaml
+            --config tests/chainsaw/chainsaw-config.yaml \
+            --cleanup-timeout 120s \
+            --delete-timeout 120s
 
       # --- CNCF AI Conformance validation ---
       # Runs after the stack health checks so gateway and metrics validators
       # see a settled inference stack.
 
-      - name: Verify expected resources exist
-        timeout-minutes: 3
-        run: |
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug
-
       - name: Build conformance validator image
         uses: ./.github/actions/aicr-build
         with:
@@ -235,7 +234,7 @@ jobs:
           build_snapshot_agent: 'false'
           validator_phases: 'conformance'
 
-      - name: Check control plane health before conformance validation
+      - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
@@ -276,6 +275,8 @@ jobs:
             validation-result.yaml
           if-no-files-found: warn
 
+      # --- Debug diagnostics (before cleanup so resources still exist) ---
+
       - name: Debug diagnostics
         if: failure()
         timeout-minutes: 5
@@ -285,7 +286,26 @@ jobs:
           kubectl_kind() {
             timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
           }
+          print_workload_images() {
+            local ns="$1"
+            kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \
+              | jq -r '
+                .items[] |
+                [
+                  .kind,
+                  .metadata.namespace + "/" + .metadata.name,
+                  (([.spec.template.spec.containers[]?.image] +
+                    [.spec.template.spec.initContainers[]?.image]) | unique | join(","))
+                ] | @tsv
+              ' || true
+          }
 
+          echo "=== Workload image inventory ==="
+          for NS in cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \
+            nvidia-network-operator kai-scheduler dynamo-system kgateway-system; do
+            echo "--- ${NS} ---"
+            print_workload_images "${NS}"
+          done
           echo "=== ClusterPolicy status ==="
           kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true
           echo "=== GPU Operator pods ==="
@@ -316,6 +336,13 @@ jobs:
           echo "=== KAI admission logs ==="
           kubectl_kind -n kai-scheduler \
             logs deployment/admission --all-containers --tail=200 2>/dev/null || true
+          echo "=== KAI scheduler logs ==="
+          kubectl_kind -n kai-scheduler \
+            logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
+          echo "=== KAI scheduler queues ==="
+          kubectl_kind get queues -A 2>/dev/null || true
+          echo "=== KAI scheduler podgroups ==="
+          kubectl_kind get podgroups -A 2>/dev/null || true
           echo "=== Recent events (kai-scheduler) ==="
           kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
           echo "=== Custom metrics API ==="
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index 6193544fa..5ac900685 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -58,6 +58,7 @@ jobs:
               - 'pkg/evidence/**'
               - '.github/actions/gpu-test-cleanup/**'
               - '.github/actions/load-versions/**'
+              - 'pkg/bundler/deployer/helm/**'
               - 'tests/chainsaw/chainsaw-config.yaml'
               - 'tests/chainsaw/ai-conformance/main.go'
               - 'tests/chainsaw/ai-conformance/common/**'
@@ -120,6 +121,7 @@ jobs:
           min_free_disk_gb: '50'
           min_available_memory_gb: '16'
           cluster_create_timeout: 900s
+          control_plane_resource_patches: 'true'
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
@@ -127,12 +129,14 @@ jobs:
           build_snapshot_agent: 'false'
           validator_phases: 'none'
 
+      # Fast readiness gate after cluster setup. Stability windows start after
+      # runtime install, where component rollouts can stress the control plane.
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
           wait_timeout: 120s
-          stability_window: 60s
+          stability_window: 0s
           recover_unhealthy: 'true'
 
       - name: Install runtime bundle
@@ -145,8 +149,9 @@ jobs:
           platform: kubeflow
           wait: 'true'
           best_effort: 'false'
+          deploy_trace: 'true'
 
-      - name: Check control plane health after runtime install
+      - name: Check control plane health
         id: post_runtime_control_plane_health
         uses: ./.github/actions/check-control-plane-health
         with:
@@ -174,7 +179,7 @@ jobs:
 
       # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
 
-      - name: Check control plane health before Karpenter
+      - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
@@ -192,7 +197,13 @@ jobs:
 
       # --- Health checks ---
 
-      - name: Check control plane health after Karpenter
+      - name: Install chainsaw
+        uses: ./.github/actions/setup-build-tools
+        with:
+          install_chainsaw: 'true'
+          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
+
+      - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
@@ -200,32 +211,20 @@ jobs:
           stability_window: 60s
           recover_unhealthy: 'true'
 
-      - name: Install chainsaw
-        uses: ./.github/actions/setup-build-tools
-        with:
-          install_chainsaw: 'true'
-          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
-
       - name: Run chainsaw health checks
+        # The H100 stack can make namespace cleanup API calls slow under load.
+        # Keep cleanup enabled, but allow more than the default 30s deadline.
         run: |
           chainsaw test \
             --test-dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
-            --config tests/chainsaw/chainsaw-config.yaml
+            --config tests/chainsaw/chainsaw-config.yaml \
+            --cleanup-timeout 120s \
+            --delete-timeout 120s
 
       # --- CNCF AI Conformance validation ---
       # Runs last to ensure the DCGM → Prometheus → adapter pipeline
       # has had time to bootstrap (pod-autoscaling check needs live metric data).
 
-      - name: Verify expected resources exist
-        timeout-minutes: 3
-        run: |
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug
-
       - name: Build conformance validator image
         uses: ./.github/actions/aicr-build
         with:
@@ -233,7 +232,7 @@ jobs:
           build_snapshot_agent: 'false'
           validator_phases: 'conformance'
 
-      - name: Check control plane health before conformance validation
+      - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
@@ -281,7 +280,26 @@ jobs:
           kubectl_kind() {
             timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
           }
+          print_workload_images() {
+            local ns="$1"
+            kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \
+              | jq -r '
+                .items[] |
+                [
+                  .kind,
+                  .metadata.namespace + "/" + .metadata.name,
+                  (([.spec.template.spec.containers[]?.image] +
+                    [.spec.template.spec.initContainers[]?.image]) | unique | join(","))
+                ] | @tsv
+              ' || true
+          }
 
+          echo "=== Workload image inventory ==="
+          for NS in cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \
+            nvidia-network-operator kai-scheduler kubeflow; do
+            echo "--- ${NS} ---"
+            print_workload_images "${NS}"
+          done
           echo "=== Grafana deployment ==="
           kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true
           echo "=== Grafana pods ==="
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index 0a14dc03e..8524e4bb7 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -555,9 +555,9 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 				Type:      recipe.ComponentTypeHelm,
 				Source:    "https://prometheus-community.github.io/helm-charts",
 			},
-			wantTimeout:  `COMPONENT_HELM_TIMEOUT="20m"`,
-			wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 2 ]]`,
-			wantComment:  `Allow the observed third-attempt Grafana success pattern`,
+			wantTimeout:    `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`,
+			wantComment:    `preserve the default retry`,
+			rejectRetryCap: true,
 		},
 	}
 
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index 02c04d0a9..8359fc07f 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -386,12 +386,9 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
   COMPONENT_MAX_RETRIES="1"
 fi
 {{ else if eq .Name "kube-prometheus-stack" -}}
-COMPONENT_HELM_TIMEOUT="20m"
-# Allow the observed third-attempt Grafana success pattern, but cap the budget
-# so kube-prometheus-stack cannot consume most of the H100 workflow timeout.
-if [[ "${COMPONENT_MAX_RETRIES}" -gt 2 ]]; then
-  COMPONENT_MAX_RETRIES="2"
-fi
+# Grafana can trip its Deployment progress deadline before a longer Helm
+# timeout helps. Keep the default 10m timeout and preserve the default retry
+# budget so later upgrades can succeed after images and controllers settle.
 {{ end -}}
 # Derive wait args: global --wait/--no-wait behavior + component timeout.
 if [[ "${NO_WAIT}" == "true" ]]; then

From 2887460ad90e1041277eaf06a3e236a173c4f063 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sun, 26 Apr 2026 07:50:33 -0700
Subject: [PATCH 13/21] ci: harden H100 control plane and Dynamo retries

---
 .github/actions/gpu-cluster-setup/action.yml  | 64 ++++++++++++++++---
 .../actions/gpu-operator-install/action.yml   | 13 +---
 .../workflows/gpu-h100-inference-test.yaml    |  2 +-
 .github/workflows/gpu-h100-training-test.yaml |  2 +-
 docs/user/cli-reference.md                    |  2 +-
 pkg/bundler/deployer/helm/helm_test.go        |  8 +++
 .../deployer/helm/templates/deploy.sh.tmpl    |  4 ++
 7 files changed, 72 insertions(+), 23 deletions(-)

diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index 3800b4b99..d20d7cbdd 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -44,6 +44,10 @@ inputs:
     description: 'Apply kubeadm patches that raise control-plane static pod resource requests'
     required: false
     default: 'false'
+  disable_control_plane_leader_election:
+    description: 'Disable kube-controller-manager and kube-scheduler leader election for single-control-plane kind CI clusters'
+    required: false
+    default: 'false'
   api_server_cpu_request:
     description: 'kube-apiserver CPU request when control_plane_resource_patches is true'
     required: false
@@ -278,6 +282,7 @@ runs:
         KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
         CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }}
         CONTROL_PLANE_RESOURCE_PATCHES: ${{ inputs.control_plane_resource_patches }}
+        DISABLE_CONTROL_PLANE_LEADER_ELECTION: ${{ inputs.disable_control_plane_leader_election }}
         API_SERVER_CPU_REQUEST: ${{ inputs.api_server_cpu_request }}
         API_SERVER_MEMORY_REQUEST: ${{ inputs.api_server_memory_request }}
         CONTROLLER_MANAGER_CPU_REQUEST: ${{ inputs.controller_manager_cpu_request }}
@@ -304,12 +309,22 @@ runs:
             ;;
         esac
 
-        if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+        case "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" in
+          true) ;;
+          ""|false) DISABLE_CONTROL_PLANE_LEADER_ELECTION=false ;;
+          *)
+            echo "::error::disable_control_plane_leader_election must be true or false, got '${DISABLE_CONTROL_PLANE_LEADER_ELECTION}'"
+            exit 1
+            ;;
+        esac
+
+        if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then
           patch_dir="$(mktemp -d)"
           config_template="$(mktemp)"
 
           # Keep heredoc body indentation aligned with this run block. GitHub
           # Actions strips the common run: | indent before bash sees it.
+          if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
           cat > "${patch_dir}/kube-apiserver+strategic.yaml" <<EOF
         apiVersion: v1
         kind: Pod
@@ -369,6 +384,7 @@ runs:
                 cpu: ${ETCD_CPU_REQUEST}
                 memory: ${ETCD_MEMORY_REQUEST}
         EOF
+          fi
 
           cat > "${config_template}" <<'EOF'
         kind: Cluster
@@ -381,14 +397,42 @@ runs:
           {{- if hasKey $ "image" }}
           image: {{ $.image }}
           {{- end }}
+        EOF
+          if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+          cat >> "${config_template}" <<EOF
           extraMounts:
-          - hostPath: __PATCH_DIR__
+          - hostPath: ${patch_dir}
             containerPath: /patches
+        EOF
+          fi
+          cat >> "${config_template}" <<'EOF'
           kubeadmConfigPatches:
+        EOF
+          if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+          cat >> "${config_template}" <<'EOF'
           - |
             kind: InitConfiguration
+            apiVersion: kubeadm.k8s.io/v1beta4
             patches:
               directory: /patches
+        EOF
+          fi
+          if [[ "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then
+          cat >> "${config_template}" <<'EOF'
+          - |
+            kind: ClusterConfiguration
+            apiVersion: kubeadm.k8s.io/v1beta4
+            controllerManager:
+              extraArgs:
+              - name: leader-elect
+                value: "false"
+            scheduler:
+              extraArgs:
+              - name: leader-elect
+                value: "false"
+        EOF
+          fi
+          cat >> "${config_template}" <<'EOF'
         {{- range $.workers }}
         - role: worker
           {{- if hasKey $ "image" }}
@@ -411,12 +455,16 @@ runs:
           {{- end }}
         {{- end }}
         EOF
-          sed -i "s#__PATCH_DIR__#${patch_dir}#g" "${config_template}"
-          echo "Applying control-plane static pod resource patches from ${patch_dir}:"
-          for patch_file in "${patch_dir}"/*.yaml; do
-            echo "--- ${patch_file}"
-            sed 's/^/  /' "${patch_file}"
-          done
+          if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+            echo "Applying control-plane static pod resource patches from ${patch_dir}:"
+            for patch_file in "${patch_dir}"/*.yaml; do
+              echo "--- ${patch_file}"
+              sed 's/^/  /' "${patch_file}"
+            done
+          fi
+          if [[ "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then
+            echo "Disabling kube-controller-manager and kube-scheduler leader election for single-control-plane kind CI."
+          fi
           CREATE_ARGS+=(--config-template="${config_template}")
         fi
 
diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml
index 2727c5e6a..86d247932 100644
--- a/.github/actions/gpu-operator-install/action.yml
+++ b/.github/actions/gpu-operator-install/action.yml
@@ -39,10 +39,6 @@ inputs:
     description: 'Continue deploying remaining bundle components after a component failure'
     required: false
     default: 'true'
-  deploy_trace:
-    description: 'Run generated deploy.sh with bash xtrace for CI diagnostics. Do not enable for bundles with secret command arguments.'
-    required: false
-    default: 'false'
 
 runs:
   using: 'composite'
@@ -128,14 +124,7 @@ runs:
         else
           echo "Deploying bundle with default args"
         fi
-        if [[ "${{ inputs.deploy_trace }}" == "true" ]]; then
-          echo "Deploy trace enabled: running deploy.sh with bash xtrace"
-          echo "::warning::deploy_trace prints shell command arguments; disable it for bundles with secret values"
-          export PS4='+ ${BASH_SOURCE}:${LINENO}: '
-          bash -x ./deploy.sh "${DEPLOY_ARGS[@]}"
-        else
-          ./deploy.sh "${DEPLOY_ARGS[@]}"
-        fi
+        ./deploy.sh "${DEPLOY_ARGS[@]}"
 
     - name: Wait for GPU operands (bundle)
       if: inputs.method == 'bundle'
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index 95340fbc8..ef08a55e0 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -126,6 +126,7 @@ jobs:
           min_available_memory_gb: '16'
           cluster_create_timeout: 900s
           control_plane_resource_patches: 'true'
+          disable_control_plane_leader_election: 'true'
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
@@ -152,7 +153,6 @@ jobs:
           platform: dynamo
           wait: 'true'
           best_effort: 'false'
-          deploy_trace: 'true'
 
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index 5ac900685..f3317b72b 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -122,6 +122,7 @@ jobs:
           min_available_memory_gb: '16'
           cluster_create_timeout: 900s
           control_plane_resource_patches: 'true'
+          disable_control_plane_leader_election: 'true'
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
@@ -149,7 +150,6 @@ jobs:
           platform: kubeflow
           wait: 'true'
           best_effort: 'false'
-          deploy_trace: 'true'
 
       - name: Check control plane health
         id: post_runtime_control_plane_health
diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md
index 302684d5a..862c098b4 100644
--- a/docs/user/cli-reference.md
+++ b/docs/user/cli-reference.md
@@ -1318,7 +1318,7 @@ After `helm install`, the same manifests are re-applied as post-install to ensur
 
 **Async components:**
 
-Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness.
+Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior.
 
 ##### DRA kubelet plugin registration
 
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index 8524e4bb7..7b52b74cf 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -528,6 +528,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 		wantTimeout         string
 		wantRetryAssignment string
 		wantRetryCap        string
+		wantApplyArgs       string
 		wantComment         string
 		rejectRetryCap      bool
 	}{
@@ -544,6 +545,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 			wantTimeout:         `COMPONENT_HELM_TIMEOUT="30m"`,
 			wantRetryAssignment: `COMPONENT_MAX_RETRIES="1"`,
 			wantRetryCap:        `if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]`,
+			wantApplyArgs:       `COMPONENT_HELM_APPLY_ARGS=(--server-side=false)`,
 		},
 		{
 			name: "kube-prometheus-stack",
@@ -609,6 +611,12 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 			if tt.wantRetryCap != "" && !strings.Contains(componentBlock, tt.wantRetryCap) {
 				t.Errorf("deploy.sh missing %s retry cap %q", tt.component.Name, tt.wantRetryCap)
 			}
+			if tt.wantApplyArgs != "" && !strings.Contains(componentBlock, tt.wantApplyArgs) {
+				t.Errorf("deploy.sh missing %s apply args %q", tt.component.Name, tt.wantApplyArgs)
+			}
+			if tt.wantApplyArgs != "" && !strings.Contains(script[blockStart:], `"${COMPONENT_HELM_APPLY_ARGS[@]}"`) {
+				t.Errorf("deploy.sh missing %s apply args in helm command", tt.component.Name)
+			}
 			if tt.wantComment != "" && !strings.Contains(componentBlock, tt.wantComment) {
 				t.Errorf("deploy.sh missing %s retry rationale", tt.component.Name)
 			}
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index 8359fc07f..711135bf4 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -375,6 +375,7 @@ retry "{{ .Name }} pre-install manifests" apply_ignoring_crd_race "${SCRIPT_DIR}
 # unnecessary retry cycles.
 COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"
 COMPONENT_MAX_RETRIES="${MAX_RETRIES}"
+COMPONENT_HELM_APPLY_ARGS=()
 {{ if eq .Name "kai-scheduler" -}}
 COMPONENT_HELM_TIMEOUT="30m"
 if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
@@ -382,6 +383,7 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
 fi
 {{ else if eq .Name "dynamo-platform" -}}
 COMPONENT_HELM_TIMEOUT="30m"
+COMPONENT_HELM_APPLY_ARGS=(--server-side=false)
 if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
   COMPONENT_MAX_RETRIES="1"
 fi
@@ -405,6 +407,7 @@ fi
 helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
   "${COMPONENT_MAX_RETRIES}" \
   helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
+  "${COMPONENT_HELM_APPLY_ARGS[@]}" \
   {{ if .Version }}--version {{ .Version }} \
   {{ end -}}
   -n {{ .Namespace }} --create-namespace \
@@ -416,6 +419,7 @@ helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
 helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
   "${COMPONENT_MAX_RETRIES}" \
   helm upgrade --install {{ .Name }} {{ .ChartName }} \
+  "${COMPONENT_HELM_APPLY_ARGS[@]}" \
   --repo {{ .Repository }} \
   {{ if .Version }}--version {{ .Version }} \
   {{ end -}}

From 23b3d5fa26daff9238ed7f193248c5cd71d359c4 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sun, 26 Apr 2026 09:39:15 -0700
Subject: [PATCH 14/21] ci: harden H100 control plane and Dynamo retries

---
 .../check-control-plane-health/action.yml     | 59 ++++++++++---------
 .github/actions/gpu-cluster-setup/action.yml  | 56 +++++++++++++-----
 .../workflows/gpu-h100-inference-test.yaml    |  2 +-
 .github/workflows/gpu-h100-training-test.yaml |  2 +-
 docs/user/cli-reference.md                    |  2 +-
 pkg/bundler/deployer/helm/helm_test.go        | 17 +++++-
 .../deployer/helm/templates/deploy.sh.tmpl    | 35 ++++++++++-
 7 files changed, 120 insertions(+), 53 deletions(-)

diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index 600310972..fdb9b4a2e 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -32,7 +32,7 @@ inputs:
     required: false
     default: 60s
   max_restarts:
-    description: 'Maximum tolerated restart count for each control-plane component'
+    description: 'Deprecated compatibility input; historical restart counts are reported but not capped'
     required: false
     default: '1'
   stability_window:
@@ -62,7 +62,6 @@ runs:
         NAMESPACE: ${{ inputs.namespace }}
         COMPONENTS: ${{ inputs.components }}
         WAIT_TIMEOUT: ${{ inputs.wait_timeout }}
-        MAX_RESTARTS: ${{ inputs.max_restarts }}
         STABILITY_WINDOW: ${{ inputs.stability_window }}
         RECOVER_UNHEALTHY: ${{ inputs.recover_unhealthy }}
         RECOVERY_COMPONENTS: ${{ inputs.recovery_components }}
@@ -70,13 +69,6 @@ runs:
       run: |
         set -euo pipefail
 
-        MAX_RESTARTS="${MAX_RESTARTS#"${MAX_RESTARTS%%[![:space:]]*}"}"
-        MAX_RESTARTS="${MAX_RESTARTS%"${MAX_RESTARTS##*[![:space:]]}"}"
-        if ! [[ "${MAX_RESTARTS}" =~ ^[0-9]+$ ]]; then
-          echo "::error::max_restarts must be a non-negative integer, got '${MAX_RESTARTS}'"
-          exit 1
-        fi
-
         MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}"
         MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}"
         if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then
@@ -152,14 +144,30 @@ runs:
           echo "${total}"
         }
 
-        enforce_restart_budget() {
+        report_restart_baseline() {
           local component="$1"
           local restart_count="$2"
 
-          if (( restart_count > MAX_RESTARTS )); then
-            echo "::error::${component} restartCount=${restart_count} exceeds max_restarts=${MAX_RESTARTS}"
-            dump_component_diagnostics "${component}"
-            kubectl_kind get --raw='/readyz' || true
+          if (( restart_count > 0 )); then
+            echo "::warning::${component} has historical restartCount=${restart_count}; checking current readiness and stability window only"
+            return
+          fi
+          echo "${component} restartCount=${restart_count}"
+        }
+
+        dump_control_plane_summary() {  # Print tier=control-plane pods (wide view), then one "<pod> restartCount=<n> ..." line per pod.
+          echo "=== Control-plane pod restart summary ==="
+          kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane -o wide || true
+          kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane \
+            -o jsonpath='{range .items[*]}{.metadata.name}{" restartCount="}{range .status.containerStatuses[*]}{.restartCount}{" "}{end}{"\n"}{end}' || true  # best-effort: kubectl failures are ignored
+        }
+
+        require_readyz() {
+          local reason="$1"
+
+          if ! kubectl_kind get --raw='/readyz'; then
+            echo "::error::kube-apiserver /readyz failed ${reason}"
+            dump_control_plane_summary
             exit 1
           fi
         }
@@ -170,6 +178,7 @@ runs:
           local pods
           local pod
 
+          dump_control_plane_summary
           kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true
           kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true
           kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
@@ -292,7 +301,7 @@ runs:
             fi
           fi
           initial_restarts=$(restart_total "${component}")
-          enforce_restart_budget "${component}" "${initial_restarts}"
+          report_restart_baseline "${component}" "${initial_restarts}"
           INITIAL_RESTARTS["${component}"]="${initial_restarts}"
         }
 
@@ -322,24 +331,17 @@ runs:
                 exit 1
               fi
               initial_restarts=$(restart_total "${component}")
-              enforce_restart_budget "${component}" "${initial_restarts}"
+              report_restart_baseline "${component}" "${initial_restarts}"
               INITIAL_RESTARTS["${component}"]="${initial_restarts}"
               recovered=true
               continue
             fi
             final_restarts=$(restart_total "${component}")
-            enforce_restart_budget "${component}" "${final_restarts}"
             if (( final_restarts > initial_restarts )); then
-              if ! try_recover_component "${component}" "restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"; then
-                echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"
-                dump_component_diagnostics "${component}"
-                exit 1
-              fi
-              initial_restarts=$(restart_total "${component}")
-              enforce_restart_budget "${component}" "${initial_restarts}"
-              INITIAL_RESTARTS["${component}"]="${initial_restarts}"
-              recovered=true
-              continue
+              echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"
+              dump_component_diagnostics "${component}"
+              kubectl_kind get --raw='/readyz' || true
+              exit 1
             fi
             INITIAL_RESTARTS["${component}"]="${final_restarts}"
           done
@@ -363,7 +365,6 @@ runs:
               exit 1
             fi
             final_restarts=$(restart_total "${component}")
-            enforce_restart_budget "${component}" "${final_restarts}"
             if (( final_restarts > initial_restarts )); then
               echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery"
               dump_component_diagnostics "${component}"
@@ -379,4 +380,4 @@ runs:
           check_component "${component}"
         done
         verify_stability_window
-        kubectl_kind get --raw='/readyz'
+        require_readyz "after stability window"
diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index d20d7cbdd..55325e870 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -44,10 +44,22 @@ inputs:
     description: 'Apply kubeadm patches that raise control-plane static pod resource requests'
     required: false
     default: 'false'
-  disable_control_plane_leader_election:
-    description: 'Disable kube-controller-manager and kube-scheduler leader election for single-control-plane kind CI clusters'
+  control_plane_leader_election_tuning:
+    description: 'Increase kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes'
     required: false
     default: 'false'
+  leader_election_lease_duration:
+    description: 'Leader election lease duration when control_plane_leader_election_tuning is true'
+    required: false
+    default: '120s'
+  leader_election_renew_deadline:
+    description: 'Leader election renew deadline when control_plane_leader_election_tuning is true'
+    required: false
+    default: '90s'
+  leader_election_retry_period:
+    description: 'Leader election retry period when control_plane_leader_election_tuning is true'
+    required: false
+    default: '10s'
   api_server_cpu_request:
     description: 'kube-apiserver CPU request when control_plane_resource_patches is true'
     required: false
@@ -282,7 +294,10 @@ runs:
         KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
         CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }}
         CONTROL_PLANE_RESOURCE_PATCHES: ${{ inputs.control_plane_resource_patches }}
-        DISABLE_CONTROL_PLANE_LEADER_ELECTION: ${{ inputs.disable_control_plane_leader_election }}
+        CONTROL_PLANE_LEADER_ELECTION_TUNING: ${{ inputs.control_plane_leader_election_tuning }}
+        LEADER_ELECTION_LEASE_DURATION: ${{ inputs.leader_election_lease_duration }}
+        LEADER_ELECTION_RENEW_DEADLINE: ${{ inputs.leader_election_renew_deadline }}
+        LEADER_ELECTION_RETRY_PERIOD: ${{ inputs.leader_election_retry_period }}
         API_SERVER_CPU_REQUEST: ${{ inputs.api_server_cpu_request }}
         API_SERVER_MEMORY_REQUEST: ${{ inputs.api_server_memory_request }}
         CONTROLLER_MANAGER_CPU_REQUEST: ${{ inputs.controller_manager_cpu_request }}
@@ -309,16 +324,16 @@ runs:
             ;;
         esac
 
-        case "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" in
+        case "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" in
           true) ;;
-          ""|false) DISABLE_CONTROL_PLANE_LEADER_ELECTION=false ;;
+          ""|false) CONTROL_PLANE_LEADER_ELECTION_TUNING=false ;;
           *)
-            echo "::error::disable_control_plane_leader_election must be true or false, got '${DISABLE_CONTROL_PLANE_LEADER_ELECTION}'"
+            echo "::error::control_plane_leader_election_tuning must be true or false, got '${CONTROL_PLANE_LEADER_ELECTION_TUNING}'"
             exit 1
             ;;
         esac
 
-        if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then
+        if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
           patch_dir="$(mktemp -d)"
           config_template="$(mktemp)"
 
@@ -417,19 +432,27 @@ runs:
               directory: /patches
         EOF
           fi
-          if [[ "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then
-          cat >> "${config_template}" <<'EOF'
+          if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+          cat >> "${config_template}" <<EOF
           - |
             kind: ClusterConfiguration
             apiVersion: kubeadm.k8s.io/v1beta4
             controllerManager:
               extraArgs:
-              - name: leader-elect
-                value: "false"
+              - name: leader-elect-lease-duration
+                value: "${LEADER_ELECTION_LEASE_DURATION}"
+              - name: leader-elect-renew-deadline
+                value: "${LEADER_ELECTION_RENEW_DEADLINE}"
+              - name: leader-elect-retry-period
+                value: "${LEADER_ELECTION_RETRY_PERIOD}"
             scheduler:
               extraArgs:
-              - name: leader-elect
-                value: "false"
+              - name: leader-elect-lease-duration
+                value: "${LEADER_ELECTION_LEASE_DURATION}"
+              - name: leader-elect-renew-deadline
+                value: "${LEADER_ELECTION_RENEW_DEADLINE}"
+              - name: leader-elect-retry-period
+                value: "${LEADER_ELECTION_RETRY_PERIOD}"
         EOF
           fi
           cat >> "${config_template}" <<'EOF'
@@ -462,8 +485,11 @@ runs:
               sed 's/^/  /' "${patch_file}"
             done
           fi
-          if [[ "${DISABLE_CONTROL_PLANE_LEADER_ELECTION}" == "true" ]]; then
-            echo "Disabling kube-controller-manager and kube-scheduler leader election for single-control-plane kind CI."
+          if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+            echo "Increasing kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes:"
+            echo "  lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
+            echo "  renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
+            echo "  retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
           fi
           CREATE_ARGS+=(--config-template="${config_template}")
         fi
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index ef08a55e0..38a82bd88 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -126,7 +126,7 @@ jobs:
           min_available_memory_gb: '16'
           cluster_create_timeout: 900s
           control_plane_resource_patches: 'true'
-          disable_control_plane_leader_election: 'true'
+          control_plane_leader_election_tuning: 'true'
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index f3317b72b..f5b95fb8c 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -122,7 +122,7 @@ jobs:
           min_available_memory_gb: '16'
           cluster_create_timeout: 900s
           control_plane_resource_patches: 'true'
-          disable_control_plane_leader_election: 'true'
+          control_plane_leader_election_tuning: 'true'
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md
index 862c098b4..d5f340c49 100644
--- a/docs/user/cli-reference.md
+++ b/docs/user/cli-reference.md
@@ -1318,7 +1318,7 @@ After `helm install`, the same manifests are re-applied as post-install to ensur
 
 **Async components:**
 
-Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior.
+Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20-minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts.
 
 ##### DRA kubelet plugin registration
 
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index 7b52b74cf..e5d06fa1e 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -530,6 +530,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 		wantRetryCap        string
 		wantApplyArgs       string
 		wantComment         string
+		wantSnippets        []string
 		rejectRetryCap      bool
 	}{
 		{
@@ -542,10 +543,15 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 				Type:      recipe.ComponentTypeHelm,
 				Source:    "oci://nvcr.io/nvidia/ai-dynamo",
 			},
-			wantTimeout:         `COMPONENT_HELM_TIMEOUT="30m"`,
-			wantRetryAssignment: `COMPONENT_MAX_RETRIES="1"`,
-			wantRetryCap:        `if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]`,
+			wantTimeout:         `COMPONENT_HELM_TIMEOUT="20m"`,
+			wantRetryAssignment: `COMPONENT_MAX_RETRIES="3"`,
+			wantRetryCap:        `if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]`,
 			wantApplyArgs:       `COMPONENT_HELM_APPLY_ARGS=(--server-side=false)`,
+			wantSnippets: []string{
+				`dump_dynamo_platform_helm_diagnostics "${namespace}"`,
+				`deployment/dynamo-platform-dynamo-operator-controller-manager`,
+				`--previous --tail=200`,
+			},
 		},
 		{
 			name: "kube-prometheus-stack",
@@ -620,6 +626,11 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 			if tt.wantComment != "" && !strings.Contains(componentBlock, tt.wantComment) {
 				t.Errorf("deploy.sh missing %s retry rationale", tt.component.Name)
 			}
+			for _, snippet := range tt.wantSnippets {
+				if !strings.Contains(script, snippet) {
+					t.Errorf("deploy.sh missing %s snippet %q", tt.component.Name, snippet)
+				}
+			}
 			if tt.rejectRetryCap && retryCapPattern.MatchString(componentBlock) {
 				t.Errorf("deploy.sh should not cap %s retries in its component block", tt.component.Name)
 			}
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index 711135bf4..cd4e786cd 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -144,6 +144,34 @@ function dump_kai_scheduler_helm_diagnostics() {
   echo "  --- End ${namespace} diagnostics ---"
 }
 
+function dump_dynamo_platform_helm_diagnostics() {  # Print deployments, jobs, pods, operator logs and recent events for namespace $1.
+  local namespace="$1"
+  if [[ "${namespace}" != "dynamo-system" ]]; then  # no-op for every namespace other than dynamo-system
+    return
+  fi
+
+  echo "  --- ${namespace} diagnostics ---"
+  echo "  Deployments:"
+  kubectl get deployments -n "${namespace}" -o wide 2>/dev/null || true  # every kubectl call below is best-effort: stderr dropped, failures ignored
+  echo "  Jobs:"
+  kubectl get jobs -n "${namespace}" 2>/dev/null || true
+  echo "  Pods:"
+  kubectl get pods -n "${namespace}" -o wide 2>/dev/null || true
+  echo "  Pod descriptions:"
+  kubectl describe pods -n "${namespace}" 2>/dev/null || true
+  echo "  Dynamo operator manager logs:"
+  kubectl logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --tail=200 2>/dev/null || true
+  echo "  Dynamo operator manager previous logs:"
+  kubectl logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --previous --tail=200 2>/dev/null || true  # --previous: logs of the prior container instance, if one exists
+  echo "  Grove operator logs:"
+  kubectl logs deployment/grove-operator -n "${namespace}" --all-containers --tail=100 2>/dev/null || true
+  echo "  Grove operator previous logs:"
+  kubectl logs deployment/grove-operator -n "${namespace}" --all-containers --previous --tail=100 2>/dev/null || true
+  echo "  Recent events:"
+  kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true  # last 50 lines of the time-sorted event list
+  echo "  --- End ${namespace} diagnostics ---"
+}
+
 # helm_retry contract:
 #   helm_retry "<description>" "<namespace>" "<max_retries>" <command> [args...]
 # Callers must pass the retry budget as the third positional argument before the
@@ -161,6 +189,7 @@ function helm_retry() {
     fi
     attempt=$((attempt + 1))
     dump_kai_scheduler_helm_diagnostics "${namespace}"
+    dump_dynamo_platform_helm_diagnostics "${namespace}"
     if [[ ${attempt} -gt ${max_retries} ]]; then
       echo "ERROR: ${desc} failed after ${attempt} attempts"
       return 1
@@ -382,10 +411,10 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
   COMPONENT_MAX_RETRIES="1"
 fi
 {{ else if eq .Name "dynamo-platform" -}}
-COMPONENT_HELM_TIMEOUT="30m"
+COMPONENT_HELM_TIMEOUT="20m"
 COMPONENT_HELM_APPLY_ARGS=(--server-side=false)
-if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
-  COMPONENT_MAX_RETRIES="1"
+if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then
+  COMPONENT_MAX_RETRIES="3"
 fi
 {{ else if eq .Name "kube-prometheus-stack" -}}
 # Grafana can trip its Deployment progress deadline before a longer Helm

From 0f308d68e8aa1e40ca73a403c251a7ac1a476108 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sun, 26 Apr 2026 14:36:45 -0700
Subject: [PATCH 15/21] Stabilize H100 GPU CI checks

---
 .../check-control-plane-health/action.yml     | 104 ++++++++++++++++-
 .github/actions/gpu-cluster-setup/action.yml  | 101 +++++++++++++++-
 .github/actions/gpu-test-cleanup/action.yml   |  69 ++++++++++-
 .../workflows/gpu-h100-inference-test.yaml    |  11 +-
 .github/workflows/gpu-h100-training-test.yaml |  13 +--
 .github/workflows/gpu-smoke-test.yaml         |   6 +
 docs/user/cli-reference.md                    |   4 +-
 pkg/bundler/deployer/helm/helm_test.go        | 110 ++++++++++++++++--
 .../deployer/helm/templates/README.md.tmpl    |  20 +++-
 .../helm/templates/component-README.md.tmpl   |  32 ++++-
 .../deployer/helm/templates/deploy.sh.tmpl    |   7 +-
 tests/chainsaw/ai-conformance/README.md       |   3 +-
 .../kind-common/assert-monitoring.yaml        |  85 ++++++++++++++
 .../kind-inference-dynamo/chainsaw-test.yaml  |   4 +-
 .../kind-training-kubeflow/chainsaw-test.yaml |   4 +-
 15 files changed, 520 insertions(+), 53 deletions(-)
 create mode 100644 tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml

diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index fdb9b4a2e..582d78746 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -85,6 +85,9 @@ runs:
           echo "::error::stability_window must be a duration like 0s, 60s, or 2m; got '${STABILITY_WINDOW}'"
           exit 1
         fi
+        if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then
+          STABILITY_WINDOW="0s"
+        fi
 
         RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}"
         RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}"
@@ -167,11 +170,109 @@ runs:
 
           if ! kubectl_kind get --raw='/readyz'; then
             echo "::error::kube-apiserver /readyz failed ${reason}"
-            dump_control_plane_summary
+            dump_all_control_plane_runtime_diagnostics
             exit 1
           fi
         }
 
+        dump_api_server_health() {  # Print the kube-apiserver verbose livez/readyz responses and the plain healthz response.
+          local endpoint
+
+          for endpoint in '/livez?verbose' '/readyz?verbose' '/healthz'; do
+            echo "=== kube-apiserver ${endpoint} ==="
+            kubectl_kind get --raw="${endpoint}" || true  # best-effort: a failing endpoint must not abort the dump
+          done
+        }
+
+        dump_kind_node_runtime_summary() {  # Print docker stats/state, a resource snapshot and CRI listings for the kind control-plane node container.
+          local node="${KIND_CLUSTER_NAME}-control-plane"
+          # Without an inspectable node container nothing below can run, so only emit a warning and return.
+          if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
+            echo "::warning::cannot collect node runtime summary: kind node container ${node} not found"
+            return
+          fi
+
+          echo "=== ${node} docker stats ==="
+          docker_timeout stats --no-stream \
+            --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \
+            "${node}" || true
+
+          echo "=== ${node} docker inspect state ==="
+          docker_timeout inspect \
+            --format 'status={{.State.Status}} running={{.State.Running}} oomKilled={{.State.OOMKilled}} pid={{.State.Pid}} started={{.State.StartedAt}} finished={{.State.FinishedAt}}' \
+            "${node}" || true
+
+          echo "=== ${node} node pressure snapshot ==="
+          docker_timeout exec "${node}" sh -c '
+            date
+            uptime || true
+            free -h || true
+            df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+            echo "--- top cpu/memory processes ---"
+            ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
+          ' || true  # the quoted script above runs inside the node container; its failure is ignored
+
+          echo "=== ${node} CRI pod/container summary ==="
+          docker_timeout exec "${node}" crictl pods || true
+          docker_timeout exec "${node}" crictl ps -a || true
+          docker_timeout exec "${node}" crictl stats || true
+        }
+
+        dump_static_pod_runtime_diagnostics() {  # Dump node-level state for component $1: manifest, CRI containers, kubelet and containerd journal excerpts.
+          local component="$1"
+          local node="${KIND_CLUSTER_NAME}-control-plane"
+          local container_ids
+          local container_id
+          local count=0
+          # Everything below executes inside the kind node container, so bail out with a warning if it cannot be inspected.
+          if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
+            echo "::warning::cannot collect ${component} runtime diagnostics: kind node container ${node} not found"
+            return
+          fi
+
+          echo "=== ${node} ${component} static pod manifest ==="
+          docker_timeout exec "${node}" sh -c "sed -n '1,220p' /etc/kubernetes/manifests/${component}.yaml" || true  # first 220 manifest lines only
+
+          echo "=== ${node} ${component} CRI containers ==="
+          docker_timeout exec "${node}" crictl ps -a --name "${component}" || true
+
+          container_ids=$(docker_timeout exec "${node}" crictl ps -a --name "${component}" -q 2>/dev/null || true)
+          for container_id in ${container_ids}; do  # unquoted so the ID list is word-split into one ID per iteration
+            count=$((count + 1))
+            if (( count > 8 )); then  # cap inspect/log output at 8 containers per component
+              echo "Skipping remaining ${component} CRI containers after first 8 entries."
+              break
+            fi
+
+            echo "=== ${node} crictl inspect ${component} ${container_id} ==="
+            docker_timeout exec "${node}" crictl inspect "${container_id}" || true
+            echo "=== ${node} crictl logs ${component} ${container_id} ==="
+            docker_timeout exec "${node}" crictl logs --tail=200 "${container_id}" || true
+          done
+
+          echo "=== ${node} kubelet journal (${component}) ==="
+          docker_timeout exec "${node}" journalctl -u kubelet --since '45 minutes ago' --no-pager 2>/dev/null \
+            | grep -Ei "${component}|static pod|mirror pod|probe|liveness|readiness|startup|back-off|backoff|container|failed|error|oom|killed" \
+            | tail -200 || true  # last 200 matching lines; a failing pipeline (e.g. grep finds nothing) is ignored
+
+          echo "=== ${node} containerd journal (${component}) ==="
+          docker_timeout exec "${node}" journalctl -u containerd --since '45 minutes ago' --no-pager 2>/dev/null \
+            | grep -Ei "${component}|container|task|shim|deadline|failed|error|oom|killed" \
+            | tail -200 || true  # last 200 matching lines; a failing pipeline (e.g. grep finds nothing) is ignored
+        }
+
+        dump_all_control_plane_runtime_diagnostics() {  # Run every cluster-wide dump, then the per-component runtime dump and lease for each entry in COMPONENTS.
+          local component
+
+          dump_control_plane_summary
+          dump_api_server_health
+          dump_kind_node_runtime_summary
+          for component in ${COMPONENTS}; do  # unquoted so COMPONENTS is word-split into one component name per iteration
+            dump_static_pod_runtime_diagnostics "${component}"
+            kubectl_kind -n kube-system get lease "${component}" -o yaml 2>/dev/null || true  # best-effort: stderr dropped, failure ignored
+          done
+        }
+
         dump_component_diagnostics() {
           local component="$1"
           local selector="component=${component}"
@@ -192,6 +293,7 @@ runs:
             kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true
           done <<< "${pods}"
 
+          dump_all_control_plane_runtime_diagnostics
           kubectl_kind -n kube-system get lease "${component}" -o yaml 2>/dev/null || true
         }
 
diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index 55325e870..93ab80cef 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -51,11 +51,11 @@ inputs:
   leader_election_lease_duration:
     description: 'Leader election lease duration when control_plane_leader_election_tuning is true'
     required: false
-    default: '120s'
+    default: '300s'
   leader_election_renew_deadline:
     description: 'Leader election renew deadline when control_plane_leader_election_tuning is true'
     required: false
-    default: '90s'
+    default: '240s'
   leader_election_retry_period:
     description: 'Leader election retry period when control_plane_leader_election_tuning is true'
     required: false
@@ -427,13 +427,27 @@ runs:
           cat >> "${config_template}" <<'EOF'
           - |
             kind: InitConfiguration
-            apiVersion: kubeadm.k8s.io/v1beta4
             patches:
               directory: /patches
         EOF
           fi
           if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+          # kind v0.31 renders kubeadm v1beta3. Keep a v1beta4 patch too so
+          # this remains valid when a future kind image switches API versions.
           cat >> "${config_template}" <<EOF
+          - |
+            kind: ClusterConfiguration
+            apiVersion: kubeadm.k8s.io/v1beta3
+            controllerManager:
+              extraArgs:
+                leader-elect-lease-duration: "${LEADER_ELECTION_LEASE_DURATION}"
+                leader-elect-renew-deadline: "${LEADER_ELECTION_RENEW_DEADLINE}"
+                leader-elect-retry-period: "${LEADER_ELECTION_RETRY_PERIOD}"
+            scheduler:
+              extraArgs:
+                leader-elect-lease-duration: "${LEADER_ELECTION_LEASE_DURATION}"
+                leader-elect-renew-deadline: "${LEADER_ELECTION_RENEW_DEADLINE}"
+                leader-elect-retry-period: "${LEADER_ELECTION_RETRY_PERIOD}"
           - |
             kind: ClusterConfiguration
             apiVersion: kubeadm.k8s.io/v1beta4
@@ -527,6 +541,87 @@ runs:
               " limits=" + ((.resources.limits // {}) | tostring))
           ' || true
 
+        normalize_cpu_request() {  # Print CPU quantity $1 in a form suitable for plain string comparison.
+          local cpu="$1"
+          # Whole-core millicpu values such as "2000m" are printed as cores ("2"); any other value is printed unchanged.
+          if [[ "${cpu}" =~ ^([0-9]+)000m$ ]]; then
+            echo "${BASH_REMATCH[1]}"
+            return
+          fi
+          echo "${cpu}"
+        }
+
+        control_plane_request() {  # Usage: control_plane_request <component> <resource>; prints that resource request from the live pod.
+          local component="$1"
+          local resource="$2"
+          # Reads the first container of the first kube-system pod labeled component=<component>.
+          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
+            get pod -l "component=${component}" \
+            -o "jsonpath={.items[0].spec.containers[0].resources.requests.${resource}}"
+        }
+
+        assert_control_plane_request() {  # Usage: assert_control_plane_request <component> <resource> <expected>; exits 1 on mismatch.
+          local component="$1"
+          local resource="$2"
+          local expected="$3"
+          local actual
+
+          actual="$(control_plane_request "${component}" "${resource}")"
+          if [[ "${resource}" == "cpu" ]]; then  # normalize both sides so "2" and "2000m" compare equal
+            expected="$(normalize_cpu_request "${expected}")"
+            actual="$(normalize_cpu_request "${actual}")"
+          fi
+          if [[ "${actual}" != "${expected}" ]]; then  # plain string comparison; non-cpu resources are compared as written
+            echo "::error::${component} ${resource} request is '${actual}', expected '${expected}'"
+            exit 1
+          fi
+          echo "${component} ${resource} request verified: ${actual}"
+        }
+
+        control_plane_command_args() {  # Usage: control_plane_command_args <component>
+          local component="$1"
+          # Prints each command entry, then each args entry, of the first container of the first matching pod, one per line.
+          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
+            get pod -l "component=${component}" \
+            -o jsonpath='{range .items[0].spec.containers[0].command[*]}{.}{"\n"}{end}{range .items[0].spec.containers[0].args[*]}{.}{"\n"}{end}'
+        }
+
+        assert_control_plane_arg() {  # Usage: assert_control_plane_arg <component> <flag>; exits 1 unless <flag> is an exact line of the live pod command/args.
+          local component="$1"
+          local expected="$2"
+          local command_args
+          command_args="$(control_plane_command_args "${component}")"
+          # "--" ends grep option parsing: expected values start with "--" (e.g. --leader-elect-retry-period=10s) and would otherwise be parsed as grep options.
+          if ! grep -Fxq -- "${expected}" <<< "${command_args}"; then
+            echo "::error::${component} live pod command/args does not contain ${expected}"
+            echo "Observed live command/args:"
+            echo "${command_args}"
+            exit 1
+          fi
+          echo "${component} command/args verified: ${expected}"
+        }
+
+        if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+          echo "Verifying control-plane resource patches..."
+          assert_control_plane_request kube-apiserver cpu "${API_SERVER_CPU_REQUEST}"
+          assert_control_plane_request kube-apiserver memory "${API_SERVER_MEMORY_REQUEST}"
+          assert_control_plane_request kube-controller-manager cpu "${CONTROLLER_MANAGER_CPU_REQUEST}"
+          assert_control_plane_request kube-controller-manager memory "${CONTROLLER_MANAGER_MEMORY_REQUEST}"
+          assert_control_plane_request kube-scheduler cpu "${SCHEDULER_CPU_REQUEST}"
+          assert_control_plane_request kube-scheduler memory "${SCHEDULER_MEMORY_REQUEST}"
+          assert_control_plane_request etcd cpu "${ETCD_CPU_REQUEST}"
+          assert_control_plane_request etcd memory "${ETCD_MEMORY_REQUEST}"
+        fi
+
+        if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+          echo "Verifying control-plane leader election timeout patches..."
+          for component in kube-controller-manager kube-scheduler; do
+            assert_control_plane_arg "${component}" "--leader-elect-lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
+            assert_control_plane_arg "${component}" "--leader-elect-renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
+            assert_control_plane_arg "${component}" "--leader-elect-retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
+          done
+        fi
+
     - name: Print GPUs (nvkind)
       shell: bash
       run: nvkind cluster print-gpus --name="${KIND_CLUSTER_NAME}"
diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml
index 5d1cef3e4..c085ed630 100644
--- a/.github/actions/gpu-test-cleanup/action.yml
+++ b/.github/actions/gpu-test-cleanup/action.yml
@@ -28,19 +28,36 @@ runs:
   using: 'composite'
   steps:
     - name: Collect debug artifacts
-      if: failure()
+      if: failure() || cancelled()
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
       run: |
         set -o pipefail
         mkdir -p /tmp/debug-artifacts
+        CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd"
         kubectl_kind() {
           timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
         }
 
         kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true
         kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true
+        kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true  # best effort: stderr is captured and failures are ignored
+        kubectl_kind get --raw='/readyz?verbose' > /tmp/debug-artifacts/apiserver-readyz.txt 2>&1 || true
+        kubectl_kind -n kube-system get pods -l tier=control-plane -o wide \
+          > /tmp/debug-artifacts/control-plane-pods.txt 2>&1 || true
+        kubectl_kind -n kube-system get events --sort-by='.lastTimestamp' \
+          > /tmp/debug-artifacts/kube-system-events.txt 2>&1 || true
+        for component in ${CONTROL_PLANE_COMPONENTS}; do  # intentionally unquoted so the space-separated list word-splits
+          kubectl_kind -n kube-system describe pod -l "component=${component}" \
+            > "/tmp/debug-artifacts/${component}-describe.txt" 2>&1 || true
+          kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --tail=300 \
+            > "/tmp/debug-artifacts/${component}-logs.txt" 2>&1 || true
+          kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --previous --tail=300 \
+            > "/tmp/debug-artifacts/${component}-previous-logs.txt" 2>&1 || true  # --previous: logs of the prior container instance; the error text is kept when there is none
+          kubectl_kind -n kube-system get lease "${component}" -o yaml \
+            > "/tmp/debug-artifacts/${component}-lease.yaml" 2>&1 || true  # NOTE(review): assumes a kube-system Lease named after the component; kube-apiserver/etcd may only yield NotFound text — confirm
+        done
         kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true
         kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
         kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
@@ -55,8 +72,54 @@ runs:
           echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive"
         fi
 
+        docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+          --format '{{.Names}}' | sort | while read -r node_container; do  # one pass per container that docker ps lists for this cluster label
+            [[ -z "${node_container}" ]] && continue  # ignore blank lines in the listing
+            node_file="${node_container//[^A-Za-z0-9_.-]/_}"  # replace anything outside [A-Za-z0-9_.-] with "_" to get a safe file-name prefix
+            timeout 30s docker exec "${node_container}" journalctl -u kubelet \
+              --since "90 minutes ago" --no-pager \
+              > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true
+            timeout 30s docker exec "${node_container}" journalctl -u containerd \
+              --since "90 minutes ago" --no-pager \
+              > "/tmp/debug-artifacts/${node_file}-containerd-journal.txt" 2>&1 || true
+            timeout 30s docker exec "${node_container}" crictl ps -a \
+              > "/tmp/debug-artifacts/${node_file}-crictl-ps-a.txt" 2>&1 || true
+            timeout 30s docker exec "${node_container}" crictl pods \
+              > "/tmp/debug-artifacts/${node_file}-crictl-pods.txt" 2>&1 || true
+            timeout 30s docker exec "${node_container}" crictl stats \
+              > "/tmp/debug-artifacts/${node_file}-crictl-stats.txt" 2>&1 || true
+            timeout 30s docker exec "${node_container}" sh -c '
+              date
+              uptime || true
+              free -h || true
+              df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+              echo "--- top cpu/memory processes ---"
+              ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
+            ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true
+            timeout 120s docker exec "${node_container}" sh -c '
+              for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do
+                echo "=== ${component} static pod manifest ==="
+                sed -n "1,220p" "/etc/kubernetes/manifests/${component}.yaml" 2>/dev/null || true
+                echo "=== ${component} CRI containers ==="
+                crictl ps -a --name "${component}" || true
+                count=0
+                for container_id in $(crictl ps -a --name "${component}" -q 2>/dev/null); do
+                  count=$((count + 1))
+                  if [ "${count}" -gt 8 ]; then
+                    echo "Skipping remaining ${component} CRI containers after first 8 entries."
+                    break
+                  fi
+                  echo "=== crictl inspect ${component} ${container_id} ==="
+                  crictl inspect "${container_id}" || true
+                  echo "=== crictl logs ${component} ${container_id} ==="
+                  crictl logs --tail=300 "${container_id}" || true
+                done
+              done
+            ' > "/tmp/debug-artifacts/${node_file}-control-plane-cri.txt" 2>&1 || true  # all four components share one 120s budget and one output file per node
+          done
+
     - name: Export kind logs
-      if: failure()
+      if: failure() || cancelled()
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
@@ -82,7 +145,7 @@ runs:
         timeout 60s docker system prune -f --filter "until=24h" || true
 
     - name: Upload debug artifacts
-      if: failure()
+      if: failure() || cancelled()
       uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f  # v6.0.0
       with:
         name: ${{ inputs.artifact_name_prefix }}-${{ github.run_id }}
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index 38a82bd88..081bb0261 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -128,10 +128,10 @@ jobs:
           control_plane_resource_patches: 'true'
           control_plane_leader_election_tuning: 'true'
 
-      - name: Build aicr
+      - name: Build aicr and snapshot agent image
         uses: ./.github/actions/aicr-build
         with:
-          build_snapshot_agent: 'false'
+          build_snapshot_agent: 'true'
           validator_phases: 'none'
 
       # Fast readiness gate after cluster setup. Stability windows start after
@@ -164,13 +164,6 @@ jobs:
 
       # --- Snapshot and GPU validation ---
 
-      - name: Build snapshot agent image
-        uses: ./.github/actions/aicr-build
-        with:
-          build_cli: 'false'
-          build_snapshot_agent: 'true'
-          validator_phases: 'none'
-
       - name: Snapshot and validate GPU
         uses: ./.github/actions/gpu-snapshot-validate
         with:
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index f5b95fb8c..a617bc718 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -124,10 +124,10 @@ jobs:
           control_plane_resource_patches: 'true'
           control_plane_leader_election_tuning: 'true'
 
-      - name: Build aicr
+      - name: Build aicr and snapshot agent image
         uses: ./.github/actions/aicr-build
         with:
-          build_snapshot_agent: 'false'
+          build_snapshot_agent: 'true'
           validator_phases: 'none'
 
       # Fast readiness gate after cluster setup. Stability windows start after
@@ -160,15 +160,6 @@ jobs:
           stability_window: 60s
           recover_unhealthy: 'true'
 
-      # --- Snapshot and GPU validation ---
-
-      - name: Build snapshot agent image
-        uses: ./.github/actions/aicr-build
-        with:
-          build_cli: 'false'
-          build_snapshot_agent: 'true'
-          validator_phases: 'none'
-
       - name: Snapshot and validate GPU
         uses: ./.github/actions/gpu-snapshot-validate
         with:
diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml
index d5b8c5c74..25d968b67 100644
--- a/.github/workflows/gpu-smoke-test.yaml
+++ b/.github/workflows/gpu-smoke-test.yaml
@@ -88,6 +88,12 @@ jobs:
 
       - name: Set up GPU cluster
         uses: ./.github/actions/gpu-cluster-setup
+        with:
+          # Keep smoke runner preflight explicit so action default changes do not
+          # silently alter L40G coverage.
+          min_gpu_count: '1'
+          min_free_disk_gb: '20'
+          min_available_memory_gb: '8'
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md
index d5f340c49..a6dd85b1d 100644
--- a/docs/user/cli-reference.md
+++ b/docs/user/cli-reference.md
@@ -1308,6 +1308,8 @@ Unknown flags are rejected with an error to catch typos (e.g., `--best-effort`).
 
 The deploy script retries failed `helm upgrade --install` and `kubectl apply` operations with exponential backoff. By default, each operation is retried up to 5 times (6 total attempts). The backoff delay increases quadratically: 5s, 20s, 45s, 80s, 120s (capped) between retries.
 
+On slower H100 CI runners, `kube-prometheus-stack` can hit Grafana's Deployment progress deadline before a longer Helm timeout would help. The deploy script intentionally keeps the default timeout and retry budget for `kube-prometheus-stack` so subsequent upgrade attempts can succeed after image pulls and controllers settle. Kind H100 Chainsaw health checks do not require Grafana because AICR conformance metrics use Prometheus, DCGM exporter, and prometheus-adapter directly.
+
 Use `--retries 0` to disable retries (fail-fast behavior). When `--best-effort` is also set, retries are exhausted first before falling through to best-effort handling.
 
 **Pre-install manifests and CRD ordering:**
@@ -1318,7 +1320,7 @@ After `helm install`, the same manifests are re-applied as post-install to ensur
 
 **Async components:**
 
-Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts.
+Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. Kai Scheduler installs use a 30 minute per-attempt timeout and cap the retry budget at 1 retry so hook diagnostics surface quickly on cold runners. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts.
 
 ##### DRA kubelet plugin registration
 
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index e5d06fa1e..ff41c88e4 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -518,20 +518,47 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) {
 	if !strings.Contains(script, `kubectl describe pods -n "${namespace}"`) {
 		t.Error("deploy.sh missing pod diagnostics")
 	}
+
+	rootReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "README.md"))
+	if err != nil {
+		t.Fatalf("failed to read root README: %v", err)
+	}
+	componentReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "kai-scheduler", "README.md"))
+	if err != nil {
+		t.Fatalf("failed to read component README: %v", err)
+	}
+	rootReadme := string(rootReadmeContent)
+	componentReadme := string(componentReadmeContent)
+	if !strings.Contains(rootReadme, `--timeout 30m`) {
+		t.Error("root README missing kai-scheduler 30m timeout")
+	}
+	if !strings.Contains(componentReadme, `--timeout 30m`) {
+		t.Error("component README missing kai-scheduler 30m timeout")
+	}
+	if strings.Contains(componentReadme, `--wait --timeout 30m`) {
+		t.Error("component README should document kai-scheduler without --wait")
+	}
+	if strings.Contains(componentReadme, `--wait --timeout 10m`) {
+		t.Error("component README should not use default timeout for kai-scheduler")
+	}
 }
 
 func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 	retryCapPattern := regexp.MustCompile(`(?m)(if \[\[ "\$\{COMPONENT_MAX_RETRIES\}" -gt \d+ \]\]|COMPONENT_MAX_RETRIES="\d+")`)
+	applyArgsExpansion := `${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"}`
 	tests := []struct {
-		name                string
-		component           recipe.ComponentRef
-		wantTimeout         string
-		wantRetryAssignment string
-		wantRetryCap        string
-		wantApplyArgs       string
-		wantComment         string
-		wantSnippets        []string
-		rejectRetryCap      bool
+		name                 string
+		component            recipe.ComponentRef
+		wantTimeout          string
+		wantRetryAssignment  string
+		wantRetryCap         string
+		wantApplyArgs        string
+		wantComment          string
+		wantSnippets         []string
+		wantReadmeSnippets   []string
+		rejectSnippets       []string
+		rejectReadmeSnippets []string
+		rejectRetryCap       bool
 	}{
 		{
 			name: "dynamo-platform",
@@ -552,6 +579,10 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 				`deployment/dynamo-platform-dynamo-operator-controller-manager`,
 				`--previous --tail=200`,
 			},
+			wantReadmeSnippets: []string{
+				`--server-side=false`,
+				`--wait --timeout 20m`,
+			},
 		},
 		{
 			name: "kube-prometheus-stack",
@@ -567,6 +598,33 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 			wantComment:    `preserve the default retry`,
 			rejectRetryCap: true,
 		},
+		{
+			name: "ordinary component defaults",
+			component: recipe.ComponentRef{
+				Name:      "gpu-operator",
+				Namespace: "gpu-operator",
+				Chart:     "gpu-operator",
+				Version:   "v25.10.1",
+				Type:      recipe.ComponentTypeHelm,
+				Source:    "https://helm.ngc.nvidia.com/nvidia",
+			},
+			wantTimeout:   `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`,
+			wantApplyArgs: `COMPONENT_HELM_APPLY_ARGS=()`,
+			wantReadmeSnippets: []string{
+				`--wait --timeout 10m`,
+			},
+			rejectSnippets: []string{
+				`--server-side=false`,
+				`COMPONENT_MAX_RETRIES="1"`,
+				`COMPONENT_MAX_RETRIES="3"`,
+			},
+			rejectReadmeSnippets: []string{
+				`--server-side=false`,
+				`--wait --timeout 20m`,
+				`--timeout 30m`,
+			},
+			rejectRetryCap: true,
+		},
 	}
 
 	for _, tt := range tests {
@@ -620,7 +678,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 			if tt.wantApplyArgs != "" && !strings.Contains(componentBlock, tt.wantApplyArgs) {
 				t.Errorf("deploy.sh missing %s apply args %q", tt.component.Name, tt.wantApplyArgs)
 			}
-			if tt.wantApplyArgs != "" && !strings.Contains(script[blockStart:], `"${COMPONENT_HELM_APPLY_ARGS[@]}"`) {
+			if tt.wantApplyArgs != "" && !strings.Contains(script[blockStart:], applyArgsExpansion) {
 				t.Errorf("deploy.sh missing %s apply args in helm command", tt.component.Name)
 			}
 			if tt.wantComment != "" && !strings.Contains(componentBlock, tt.wantComment) {
@@ -631,9 +689,41 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 					t.Errorf("deploy.sh missing %s snippet %q", tt.component.Name, snippet)
 				}
 			}
+			for _, snippet := range tt.rejectSnippets {
+				if strings.Contains(componentBlock, snippet) {
+					t.Errorf("deploy.sh should not include %s snippet %q", tt.component.Name, snippet)
+				}
+			}
 			if tt.rejectRetryCap && retryCapPattern.MatchString(componentBlock) {
 				t.Errorf("deploy.sh should not cap %s retries in its component block", tt.component.Name)
 			}
+
+			rootReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "README.md"))
+			if err != nil {
+				t.Fatalf("failed to read root README: %v", err)
+			}
+			componentReadmeContent, err := os.ReadFile(filepath.Join(outputDir, tt.component.Name, "README.md"))
+			if err != nil {
+				t.Fatalf("failed to read component README: %v", err)
+			}
+			rootReadme := string(rootReadmeContent)
+			componentReadme := string(componentReadmeContent)
+			for _, snippet := range tt.wantReadmeSnippets {
+				if !strings.Contains(rootReadme, snippet) {
+					t.Errorf("root README missing %s snippet %q", tt.component.Name, snippet)
+				}
+				if !strings.Contains(componentReadme, snippet) {
+					t.Errorf("component README missing %s snippet %q", tt.component.Name, snippet)
+				}
+			}
+			for _, snippet := range tt.rejectReadmeSnippets {
+				if strings.Contains(rootReadme, snippet) {
+					t.Errorf("root README should not include %s snippet %q", tt.component.Name, snippet)
+				}
+				if strings.Contains(componentReadme, snippet) {
+					t.Errorf("component README should not include %s snippet %q", tt.component.Name, snippet)
+				}
+			}
 		})
 	}
 }
diff --git a/pkg/bundler/deployer/helm/templates/README.md.tmpl b/pkg/bundler/deployer/helm/templates/README.md.tmpl
index 3c3e874f4..ba3cef380 100644
--- a/pkg/bundler/deployer/helm/templates/README.md.tmpl
+++ b/pkg/bundler/deployer/helm/templates/README.md.tmpl
@@ -77,19 +77,31 @@ kustomize build '{{ .Repository }}//{{ .Path }}{{ if .Tag }}?ref={{ .Tag }}{{ en
 ```bash
 {{ if .IsOCI -}}
 helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --version {{ .Version }} \
   -n {{ .Namespace }} --create-namespace \
   -f {{ .Name }}/values.yaml \
   -f {{ .Name }}/cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+  {{- else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+  {{- end }}{{/* left-trim so no whitespace-only line is emitted and the closing code fence stays at column 0 */}}
 {{ else -}}
 helm upgrade --install {{ .Name }} {{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --repo {{ .Repository }} \
   --version {{ .Version }} \
   -n {{ .Namespace }} --create-namespace \
   -f {{ .Name }}/values.yaml \
   -f {{ .Name }}/cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+  {{- else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+  {{- end }}{{/* left-trim so no whitespace-only line is emitted and the closing code fence stays at column 0 */}}
 {{ end -}}
 ```
 {{ end -}}
@@ -119,7 +131,9 @@ Each Helm component has two values files in its directory:
 
 ## Upgrade
 
-To upgrade a specific Helm component:
+To upgrade a specific Helm component, use the generic form below. Some
+components require component-specific flags; use the component subdirectory
+`README.md` for the exact command.
 
 ```bash
 helm upgrade <component> <chart> --version <version> -n <namespace> -f <component>/values.yaml -f <component>/cluster-values.yaml --wait --timeout 10m
diff --git a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
index 068bfcd28..7797779a0 100644
--- a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
+++ b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
@@ -43,19 +43,31 @@ Namespace: {{ .Namespace }}
 ```bash
 {{ if .IsOCI -}}
 helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --version {{ .Version }} \
   -n {{ .Namespace }} --create-namespace \
   -f values.yaml \
   -f cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+  {{- else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+  {{- end }}{{/* left-trim so no whitespace-only line is emitted and the closing code fence stays at column 0 */}}
 {{ else -}}
 helm upgrade --install {{ .Name }} {{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --repo {{ .Repository }} \
   --version {{ .Version }} \
   -n {{ .Namespace }} --create-namespace \
   -f values.yaml \
   -f cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+  {{- else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+  {{- end }}{{/* left-trim so no whitespace-only line is emitted and the closing code fence stays at column 0 */}}
 {{ end -}}
 ```
 {{ if .HasManifests }}
@@ -70,19 +82,31 @@ kubectl apply -f manifests/
 ```bash
 {{ if .IsOCI -}}
 helm upgrade {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --version {{ .Version }} \
   -n {{ .Namespace }} \
   -f values.yaml \
   -f cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+  {{- else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+  {{- end }}{{/* left-trim so no whitespace-only line is emitted and the closing code fence stays at column 0 */}}
 {{ else -}}
 helm upgrade {{ .Name }} {{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --repo {{ .Repository }} \
   --version {{ .Version }} \
   -n {{ .Namespace }} \
   -f values.yaml \
   -f cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+  {{- else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+  {{- end }}{{/* left-trim so no whitespace-only line is emitted and the closing code fence stays at column 0 */}}
 {{ end -}}
 ```
 
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index cd4e786cd..7bd03a356 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -418,7 +418,8 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then
 fi
 {{ else if eq .Name "kube-prometheus-stack" -}}
 # Grafana can trip its Deployment progress deadline before a longer Helm
-# timeout helps. Keep the default 10m timeout and preserve the default retry
+# timeout helps, especially on slower H100 CI runners under image-pull and
+# control-plane load. Keep the default 10m timeout and preserve the default retry
 # budget so later upgrades can succeed after images and controllers settle.
 {{ end -}}
 # Derive wait args: global --wait/--no-wait behavior + component timeout.
@@ -436,7 +437,7 @@ fi
 helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
   "${COMPONENT_MAX_RETRIES}" \
   helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
-  "${COMPONENT_HELM_APPLY_ARGS[@]}" \
+  ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \
   {{ if .Version }}--version {{ .Version }} \
   {{ end -}}
   -n {{ .Namespace }} --create-namespace \
@@ -448,7 +449,7 @@ helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
 helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
   "${COMPONENT_MAX_RETRIES}" \
   helm upgrade --install {{ .Name }} {{ .ChartName }} \
-  "${COMPONENT_HELM_APPLY_ARGS[@]}" \
+  ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \
   --repo {{ .Repository }} \
   {{ if .Version }}--version {{ .Version }} \
   {{ end -}}
diff --git a/tests/chainsaw/ai-conformance/README.md b/tests/chainsaw/ai-conformance/README.md
index b1a88e9d4..a69b88f13 100644
--- a/tests/chainsaw/ai-conformance/README.md
+++ b/tests/chainsaw/ai-conformance/README.md
@@ -73,10 +73,11 @@ tests/chainsaw/ai-conformance/
 │   ├── assert-cert-manager.yaml         # cert-manager healthy
 │   ├── assert-dra-driver.yaml           # DRA driver healthy
 │   ├── assert-kai-scheduler.yaml        # KAI scheduler healthy
-│   ├── assert-monitoring.yaml           # Prometheus stack healthy
+│   ├── assert-monitoring.yaml           # Prometheus stack healthy with Grafana
 │   └── assert-skyhook.yaml              # Skyhook operator healthy
 ├── kind-common/                         # Shared Kind-only assertions
 │   ├── assert-gpu-operator.yaml         # GPU operator healthy on kind
+│   ├── assert-monitoring.yaml           # Prometheus stack healthy without Grafana
 │   ├── assert-network-operator.yaml     # Network operator healthy on kind
 │   └── assert-nvsentinel.yaml           # NVSentinel healthy on kind
 ├── kind-inference-dynamo/               # Kind + H100 + inference + dynamo leaf suite
diff --git a/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml
new file mode 100644
index 000000000..868be3fea
--- /dev/null
+++ b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml
@@ -0,0 +1,85 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Assert kind monitoring stack components required by H100 CI are healthy.
+# Grafana is intentionally not asserted here because conformance metrics use
+# Prometheus, DCGM exporter, and prometheus-adapter directly.
+
+# Prometheus Operator - manages Prometheus, Alertmanager, and ServiceMonitor CRs
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-prometheus-operator
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):  # parenthesized key is a JMESPath expression: keep only the Available condition
+    - status: "True"  # condition status is a string, so it stays quoted
+---
+# kube-state-metrics - Kubernetes object state metrics
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-state-metrics
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
+---
+# Prometheus StatefulSet - time series database
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: prometheus-kube-prometheus-prometheus
+  namespace: monitoring
+status:
+  (readyReplicas > `0`): true  # backticks mark a JMESPath literal; passes once at least one replica is ready
+---
+# Alertmanager StatefulSet - alert routing and silencing
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: alertmanager-kube-prometheus-alertmanager
+  namespace: monitoring
+status:
+  (readyReplicas > `0`): true
+---
+# Prometheus Node Exporter DaemonSet - node-level hardware/OS metrics
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: prometheus-node-exporter
+  namespace: monitoring
+status:
+  (numberReady > `0`): true  # at least one node-exporter pod is ready
+  (desiredNumberScheduled > `0`): true  # and the DaemonSet targets at least one node
+---
+# k8s-ephemeral-storage-metrics - ephemeral storage usage metrics
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: k8s-ephemeral-storage-metrics
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
+---
+# Prometheus Adapter - custom metrics API for HPA
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus-adapter
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
index 85aa33ab6..cac236b32 100644
--- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
+++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
@@ -65,10 +65,10 @@ spec:
 
     # ── Monitoring ─────────────────────────────────────────────────────
     - name: assert-monitoring
-      description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter.
+      description: Verify kind monitoring stack without Grafana.
       try:
         - assert:
-            file: ../common/assert-monitoring.yaml
+            file: ../kind-common/assert-monitoring.yaml
 
     # ── kgateway ───────────────────────────────────────────────────────
     - name: assert-kgateway
diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
index e3d2b35a9..20332ad64 100644
--- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
+++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
@@ -60,10 +60,10 @@ spec:
             file: ../kind-common/assert-gpu-operator.yaml
 
     - name: assert-monitoring
-      description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter.
+      description: Verify kind monitoring stack without Grafana.
       try:
         - assert:
-            file: ../common/assert-monitoring.yaml
+            file: ../kind-common/assert-monitoring.yaml
 
     - name: assert-skyhook
       description: Verify Skyhook operator controller-manager is available.

From 22304c97f35cbe2287cdd91d780ad0b956007459 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sun, 26 Apr 2026 14:43:55 -0700
Subject: [PATCH 16/21] Address H100 CI review feedback

---
 .../check-control-plane-health/action.yml     | 54 +++++++++++++-----
 .github/actions/gpu-cluster-setup/action.yml  | 55 +++++++++++++++++++
 .../actions/install-karpenter-kwok/action.yml | 23 ++++++++
 .../workflows/gpu-h100-inference-test.yaml    |  8 +++
 .github/workflows/gpu-h100-training-test.yaml |  8 +++
 docs/user/cli-reference.md                    |  6 +-
 .../deployer/helm/templates/deploy.sh.tmpl    |  3 +
 7 files changed, 142 insertions(+), 15 deletions(-)

diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index 582d78746..59b52e3f0 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -69,6 +69,16 @@ runs:
       run: |
         set -euo pipefail
 
+        # Abort the step unless the given input is <digits><s|m|h> (for example 60s, 2m, 1h).
+        # $1: input name shown in the error annotation; $2: value to validate.
+        validate_duration_input() {
+          local name="$1" value="$2"
+
+          [[ "${value}" =~ ^[0-9]+[smh]$ ]] && return 0  # well-formed: nothing to report
+          echo "::error::${name} must be a duration like 60s, 2m, or 1h; got '${value}'"
+          exit 1
+        }
+
         MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}"
         MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}"
         if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then
@@ -76,15 +86,16 @@ runs:
           exit 1
         fi
 
+        WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}"  # strip leading whitespace
+        WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}"  # strip trailing whitespace
+        validate_duration_input wait_timeout "${WAIT_TIMEOUT}"  # no default is applied first, so an empty value fails validation
+
         STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}"
         STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}"
         if [[ -z "${STABILITY_WINDOW}" ]]; then
           STABILITY_WINDOW="0s"
         fi
-        if ! [[ "${STABILITY_WINDOW}" =~ ^[0-9]+[smh]$ ]]; then
-          echo "::error::stability_window must be a duration like 0s, 60s, or 2m; got '${STABILITY_WINDOW}'"
-          exit 1
-        fi
+        validate_duration_input stability_window "${STABILITY_WINDOW}"
         if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then
           STABILITY_WINDOW="0s"
         fi
@@ -107,7 +118,11 @@ runs:
           timeout 30s docker "$@"
         }
 
+        STATIC_POD_RECREATE_SETTLE_SECONDS=5
+        RESTART_COUNT_ATTEMPTS=3
+        RESTART_COUNT_RETRY_SLEEP_SECONDS=2
         declare -A RECOVERY_ATTEMPTS=()
+        declare -A INITIAL_RESTARTS=()
 
         kubectl_kind get --raw='/readyz' || true
 
@@ -127,15 +142,26 @@ runs:
           local restart_counts
           local restart_count
           local total=0
+          local attempt
+
+          for ((attempt = 1; attempt <= RESTART_COUNT_ATTEMPTS; attempt++)); do
+            if restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \
+              -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then
+              if [[ -n "${restart_counts}" ]]; then
+                break
+              fi
+              echo "::warning::no container statuses found for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
+            else
+              echo "::warning::failed to read restart counts for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
+            fi
+
+            if (( attempt < RESTART_COUNT_ATTEMPTS )); then
+              sleep "${RESTART_COUNT_RETRY_SLEEP_SECONDS}"
+            fi
+          done
 
-          if ! restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \
-            -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then
-            echo "::error::failed to read restart counts for ${component} pods" >&2
-            dump_component_diagnostics "${component}" >&2
-            exit 1
-          fi
           if [[ -z "${restart_counts}" ]]; then
-            echo "::error::no container statuses found for ${component} pods" >&2
+            echo "::error::no container statuses found for ${component} pods after ${RESTART_COUNT_ATTEMPTS} attempts" >&2
             dump_component_diagnostics "${component}" >&2
             exit 1
           fi
@@ -358,7 +384,9 @@ runs:
             fi
           done
 
-          sleep 5
+          # Give kubelet a short interval to observe the stopped CRI container
+          # and refresh the mirror pod before kubectl wait reads pod status.
+          sleep "${STATIC_POD_RECREATE_SETTLE_SECONDS}"
           if ! wait_ready "${component}"; then
             echo "::warning::${component} did not recover after static pod container restart"
             dump_component_diagnostics "${component}"
@@ -476,8 +504,6 @@ runs:
           done
         }
 
-        declare -A INITIAL_RESTARTS=()
-
         for component in ${COMPONENTS}; do
           check_component "${component}"
         done
diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index 93ab80cef..21d27800c 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -309,6 +309,60 @@ runs:
       run: |
         set -euo pipefail
 
+        validate_duration_input() {
+          local input_name="$1"
+          local input_value="$2"
+
+          if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then
+            echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'"
+            exit 1
+          fi
+        }
+
+        validate_generated_control_plane_config() {
+          if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+            for patch_file in "${patch_dir}"/*.yaml; do
+              if ! grep -Fxq 'apiVersion: v1' "${patch_file}" ||
+                ! grep -Fxq 'kind: Pod' "${patch_file}" ||
+                ! grep -Eq '^[[:space:]]+resources:$' "${patch_file}"; then
+                echo "::error::rendered static pod patch ${patch_file} is missing expected top-level YAML"
+                sed 's/^/  /' "${patch_file}" || true
+                exit 1
+              fi
+            done
+
+            if ! grep -Eq '^[[:space:]]*extraMounts:$' "${config_template}" ||
+              ! grep -Fq 'directory: /patches' "${config_template}"; then
+              echo "::error::rendered kind config is missing control-plane patch mounts"
+              sed 's/^/  /' "${config_template}" || true
+              exit 1
+            fi
+          fi
+
+          if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+            for expected in \
+              'apiVersion: kubeadm.k8s.io/v1beta3' \
+              'apiVersion: kubeadm.k8s.io/v1beta4' \
+              "leader-elect-lease-duration: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
+              "leader-elect-renew-deadline: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
+              "leader-elect-retry-period: \"${LEADER_ELECTION_RETRY_PERIOD}\"" \
+              "value: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
+              "value: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
+              "value: \"${LEADER_ELECTION_RETRY_PERIOD}\""; do
+              if ! grep -Fq "${expected}" "${config_template}"; then
+                echo "::error::rendered kind config is missing expected leader election setting: ${expected}"
+                sed 's/^/  /' "${config_template}" || true
+                exit 1
+              fi
+            done
+          fi
+        }
+
+        validate_duration_input cluster_create_timeout "${CLUSTER_CREATE_TIMEOUT}"
+        validate_duration_input leader_election_lease_duration "${LEADER_ELECTION_LEASE_DURATION}"
+        validate_duration_input leader_election_renew_deadline "${LEADER_ELECTION_RENEW_DEADLINE}"
+        validate_duration_input leader_election_retry_period "${LEADER_ELECTION_RETRY_PERIOD}"
+
         CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}")
         if [[ -n "${KIND_NODE_IMAGE}" ]]; then
           echo "Using kind node image: ${KIND_NODE_IMAGE}"
@@ -505,6 +559,7 @@ runs:
             echo "  renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
             echo "  retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
           fi
+          validate_generated_control_plane_config
           CREATE_ARGS+=(--config-template="${config_template}")
         fi
 
diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml
index c917b2abc..f66848e6f 100644
--- a/.github/actions/install-karpenter-kwok/action.yml
+++ b/.github/actions/install-karpenter-kwok/action.yml
@@ -70,5 +70,28 @@ runs:
         KARPENTER_HELM_TIMEOUT: ${{ inputs.karpenter_helm_timeout }}
       run: |
         set -euo pipefail
+        validate_duration_input() {
+          local input_name="$1"
+          local input_value="$2"
+
+          if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then
+            echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'"
+            exit 1
+          fi
+        }
+
+        validate_seconds_input() {
+          local input_name="$1"
+          local input_value="$2"
+
+          if ! [[ "${input_value}" =~ ^[0-9]+$ ]]; then
+            echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'"
+            exit 1
+          fi
+        }
+
+        validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT}"
+        validate_seconds_input ko_build_timeout "${KO_BUILD_TIMEOUT}"
+        validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT}"
         bash kwok/scripts/install-karpenter-kwok.sh
         kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index 081bb0261..4ef7c43e7 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -154,6 +154,8 @@ jobs:
           wait: 'true'
           best_effort: 'false'
 
+      # Runtime install creates many CRDs, webhooks, and controllers. Keep a
+      # stability window here to catch KCM/scheduler restarts before snapshot.
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
@@ -174,6 +176,8 @@ jobs:
 
       # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
 
+      # Snapshot deploys a GPU Job and exercises cluster discovery; verify the
+      # control plane stayed stable before adding Karpenter/KWOK.
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
@@ -198,6 +202,8 @@ jobs:
           install_chainsaw: 'true'
           chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
 
+      # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above
+      # only installs a runner-side binary.
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
@@ -227,6 +233,8 @@ jobs:
           build_snapshot_agent: 'false'
           validator_phases: 'conformance'
 
+      # Validator image build/load can contend with Docker and kind containerd;
+      # verify the control plane before the final conformance workload.
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index a617bc718..ae96a49a7 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -151,6 +151,8 @@ jobs:
           wait: 'true'
           best_effort: 'false'
 
+      # Runtime install creates many CRDs, webhooks, and controllers. Keep a
+      # stability window here to catch KCM/scheduler restarts before snapshot.
       - name: Check control plane health
         id: post_runtime_control_plane_health
         uses: ./.github/actions/check-control-plane-health
@@ -170,6 +172,8 @@ jobs:
 
       # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
 
+      # Snapshot deploys a GPU Job and exercises cluster discovery; verify the
+      # control plane stayed stable before adding Karpenter/KWOK.
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
@@ -194,6 +198,8 @@ jobs:
           install_chainsaw: 'true'
           chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
 
+      # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above
+      # only installs a runner-side binary.
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
@@ -223,6 +229,8 @@ jobs:
           build_snapshot_agent: 'false'
           validator_phases: 'conformance'
 
+      # Validator image build/load can contend with Docker and kind containerd;
+      # verify the control plane before the final conformance workload.
       - name: Check control plane health
         uses: ./.github/actions/check-control-plane-health
         with:
diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md
index a6dd85b1d..ba62aef1a 100644
--- a/docs/user/cli-reference.md
+++ b/docs/user/cli-reference.md
@@ -1320,7 +1320,11 @@ After `helm install`, the same manifests are re-applied as post-install to ensur
 
 **Async components:**
 
-Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. Kai Scheduler installs use a 30 minute per-attempt timeout and cap the retry budget at 1 retry so hook diagnostics surface quickly on cold runners. The `dynamo-platform` Helm command uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts.
+Components that use operator patterns with custom resources that reconcile asynchronously use component-specific install behavior:
+
+- `kai-scheduler` installs without `--wait` to avoid Helm timing out on CR readiness. It uses a 30 minute per-attempt timeout and caps the retry budget at 1 retry so hook diagnostics surface quickly on cold runners.
+- `dynamo-platform` uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts.
+- `kube-prometheus-stack` keeps the default timeout and retry budget because Grafana can hit its Deployment progress deadline before a longer Helm timeout would help, especially on slower H100 CI runners under image-pull and control-plane load.
 
 ##### DRA kubelet plugin registration
 
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index 7bd03a356..826ab08ed 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -412,6 +412,9 @@ if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
 fi
 {{ else if eq .Name "dynamo-platform" -}}
 COMPONENT_HELM_TIMEOUT="20m"
+# Grove owns the generated webhook certificate Secret data after install.
+# Client-side apply avoids server-side field ownership conflicts during retries.
+# Requires Helm v4+ for --server-side=false; AICR bundles pin Helm v4 in .settings.yaml.
 COMPONENT_HELM_APPLY_ARGS=(--server-side=false)
 if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then
   COMPONENT_MAX_RETRIES="3"

From 553051c3621cab63cf631c7a3851edb14b123b0e Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sun, 26 Apr 2026 14:53:42 -0700
Subject: [PATCH 17/21] Avoid pull request events for GPU runners

---
 .github/workflows/gpu-h100-inference-test.yaml | 6 +++---
 .github/workflows/gpu-h100-training-test.yaml  | 6 +++---
 .github/workflows/gpu-smoke-test.yaml          | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index 4ef7c43e7..27d9d27cc 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -20,8 +20,6 @@ on:
   push:
     branches:
       - "pull-request/[0-9]+"
-  pull_request:
-    types: [labeled]
   workflow_dispatch: {}  # Allow manual runs
 
 permissions:
@@ -86,11 +84,13 @@ jobs:
 
   gpu-inference-test:
     needs: [check-paths]
+    # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+    # checkout. PR GPU coverage runs through the pull-request/<number> push
+    # mirror after ok-to-test approval.
     if: >
       always() && (
         github.event_name == 'schedule' ||
         github.event_name == 'workflow_dispatch' ||
-        (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') ||
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
     name: GPU Inference Test (nvkind + H100 x2)
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index ae96a49a7..972e38f73 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -20,8 +20,6 @@ on:
   push:
     branches:
       - "pull-request/[0-9]+"
-  pull_request:
-    types: [labeled]
   workflow_dispatch: {}  # Allow manual runs
 
 permissions:
@@ -82,11 +80,13 @@ jobs:
 
   gpu-training-test:
     needs: [check-paths]
+    # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+    # checkout. PR GPU coverage runs through the pull-request/<number> push
+    # mirror after ok-to-test approval.
     if: >
       always() && (
         github.event_name == 'schedule' ||
         github.event_name == 'workflow_dispatch' ||
-        (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') ||
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
     name: GPU Training Test (nvkind + H100 x2)
diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml
index 25d968b67..805548afc 100644
--- a/.github/workflows/gpu-smoke-test.yaml
+++ b/.github/workflows/gpu-smoke-test.yaml
@@ -20,8 +20,6 @@ on:
   push:
     branches:
       - "pull-request/[0-9]+"
-  pull_request:
-    types: [labeled]
   workflow_dispatch: {}  # Allow manual runs
 
 permissions:
@@ -62,11 +60,13 @@ jobs:
 
   gpu-smoke-test:
     needs: [check-paths]
+    # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+    # checkout. PR GPU coverage runs through the pull-request/<number> push
+    # mirror after ok-to-test approval.
     if: >
       always() && (
         github.event_name == 'schedule' ||
         github.event_name == 'workflow_dispatch' ||
-        (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') ||
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
     name: GPU Smoke Test (nvkind + L40G)

From ff99d6af322f6cb67407c9b91a896438d673eaeb Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sun, 26 Apr 2026 15:14:31 -0700
Subject: [PATCH 18/21] Address GPU CI review feedback

---
 .github/actions/aicr-build/action.yml         |  79 +--
 .github/actions/aicr-build/build-cli.sh       |  10 +
 .../aicr-build/build-snapshot-agent.sh        |  18 +
 .../aicr-build/build-validator-images.sh      |  35 ++
 .github/actions/aicr-build/stage-cli.sh       |   4 +
 .../check-control-plane-health/action.yml     | 444 +--------------
 .../check-control-plane-health.sh             | 457 +++++++++++++++
 .github/actions/gpu-cluster-setup/action.yml  | 521 +-----------------
 .../check-runner-capacity.sh                  |  29 +
 .../configure-nvidia-container-toolkit.sh     |  21 +
 .../create-gpu-kind-cluster.sh                | 411 ++++++++++++++
 .../delete-stale-kind-cluster.sh              |  39 ++
 .../increase-inotify-limits.sh                |  19 +
 .../gpu-cluster-setup/install-nvkind.sh       |  19 +
 .../gpu-cluster-setup/runner-preflight.sh     |  69 +++
 .../validate-docker-gpu-access.sh             |  17 +
 .../actions/gpu-cluster-setup/validate-env.sh |  21 +
 .../gpu-cluster-setup/warm-kind-node-image.sh |  29 +
 .../actions/gpu-operator-install/action.yml   |  98 +---
 .../gpu-operator-install/generate-bundle.sh   |  23 +
 .../gpu-operator-install/generate-recipe.sh   |  29 +
 .../gpu-operator-install/install-bundle.sh    |  35 ++
 .../install-gpu-operator-helm.sh              |  29 +
 .../wait-gpu-operands-bundle.sh               |  38 ++
 .../wait-gpu-operands-helm.sh                 |  22 +
 .../actions/gpu-snapshot-validate/action.yml  |  56 +-
 .../debug-snapshot-job.sh                     |  35 ++
 .../gpu-snapshot-validate/run-snapshot.sh     |  26 +
 .../validate-snapshot-gpu.sh                  |  31 ++
 .github/actions/gpu-test-cleanup/action.yml   | 105 +---
 .../gpu-test-cleanup/cleanup-kind-cluster.sh  |  32 ++
 .../collect-debug-artifacts.sh                | 104 ++++
 .../gpu-test-cleanup/export-kind-logs.sh      |  19 +
 .../actions/install-karpenter-kwok/action.yml |  34 +-
 .../install-karpenter-kwok.sh                 |  41 ++
 .../resolve-versions.sh                       |  20 +
 .github/scripts/gpu-chainsaw-health.sh        |  10 +
 .github/scripts/gpu-debug-diagnostics.sh      | 146 +++++
 .github/scripts/gpu-smoke-run-nvidia-smi.sh   |  24 +
 .github/scripts/gpu-validate-conformance.sh   |  15 +
 .../workflows/gpu-h100-inference-test.yaml    | 125 +----
 .github/workflows/gpu-h100-training-test.yaml | 101 +---
 .github/workflows/gpu-smoke-test.yaml         |  39 +-
 docs/user/cli-reference.md                    |   4 +-
 pkg/bundler/deployer/helm/helm_test.go        |   4 +
 .../deployer/helm/templates/README.md.tmpl    |   3 +
 .../helm/templates/component-README.md.tmpl   |   3 +
 .../deployer/helm/templates/deploy.sh.tmpl    |  31 +-
 48 files changed, 1975 insertions(+), 1549 deletions(-)
 create mode 100644 .github/actions/aicr-build/build-cli.sh
 create mode 100644 .github/actions/aicr-build/build-snapshot-agent.sh
 create mode 100644 .github/actions/aicr-build/build-validator-images.sh
 create mode 100644 .github/actions/aicr-build/stage-cli.sh
 create mode 100644 .github/actions/check-control-plane-health/check-control-plane-health.sh
 create mode 100644 .github/actions/gpu-cluster-setup/check-runner-capacity.sh
 create mode 100644 .github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
 create mode 100644 .github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
 create mode 100644 .github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
 create mode 100644 .github/actions/gpu-cluster-setup/increase-inotify-limits.sh
 create mode 100644 .github/actions/gpu-cluster-setup/install-nvkind.sh
 create mode 100644 .github/actions/gpu-cluster-setup/runner-preflight.sh
 create mode 100644 .github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
 create mode 100644 .github/actions/gpu-cluster-setup/validate-env.sh
 create mode 100644 .github/actions/gpu-cluster-setup/warm-kind-node-image.sh
 create mode 100644 .github/actions/gpu-operator-install/generate-bundle.sh
 create mode 100644 .github/actions/gpu-operator-install/generate-recipe.sh
 create mode 100644 .github/actions/gpu-operator-install/install-bundle.sh
 create mode 100644 .github/actions/gpu-operator-install/install-gpu-operator-helm.sh
 create mode 100644 .github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
 create mode 100644 .github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
 create mode 100644 .github/actions/gpu-snapshot-validate/debug-snapshot-job.sh
 create mode 100644 .github/actions/gpu-snapshot-validate/run-snapshot.sh
 create mode 100644 .github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh
 create mode 100644 .github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh
 create mode 100644 .github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
 create mode 100644 .github/actions/gpu-test-cleanup/export-kind-logs.sh
 create mode 100644 .github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
 create mode 100644 .github/actions/install-karpenter-kwok/resolve-versions.sh
 create mode 100644 .github/scripts/gpu-chainsaw-health.sh
 create mode 100644 .github/scripts/gpu-debug-diagnostics.sh
 create mode 100644 .github/scripts/gpu-smoke-run-nvidia-smi.sh
 create mode 100644 .github/scripts/gpu-validate-conformance.sh

diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml
index 14d6a595b..671392215 100644
--- a/.github/actions/aicr-build/action.yml
+++ b/.github/actions/aicr-build/action.yml
@@ -41,91 +41,22 @@ runs:
       shell: bash
       env:
         GOFLAGS: -mod=vendor
-      run: |
-        set -euo pipefail
-        mkdir -p dist
-        if [[ -x dist/aicr ]]; then
-          echo "Reusing existing dist/aicr"
-          exit 0
-        fi
-        CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
+      run: bash "${{ github.action_path }}/build-cli.sh"
 
     - name: Build snapshot agent image and load into kind
       if: inputs.build_snapshot_agent == 'true'
       shell: bash
-      run: |
-        set -euo pipefail
-        # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
-        # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed.
-        # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot.
-        docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
-        FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04
-        COPY dist/aicr /usr/local/bin/aicr
-        ENTRYPOINT ["/usr/local/bin/aicr"]
-        DOCKERFILE
-
-        # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but
-        # does not set a node selector, so it can land on any GPU-capable node
-        # including the control-plane (e.g., T4 smoke test).
-        #
-        # Timeout is intentionally generous (900s per attempt). H100 self-hosted
-        # runners transfer images over a shared Docker-in-Docker bridge; large
-        # CUDA base images (~250MB compressed) combined with I/O contention from
-        # parallel GPU operator pods regularly exceed the previous 600s limit.
-        timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
-          echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
-          timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
-        }
+      run: bash "${{ github.action_path }}/build-snapshot-agent.sh"
 
     - name: Build validator images and load into kind
       if: "!(inputs.validator_phases == 'none' || (inputs.validator_phases == '' && inputs.build_validators == 'false'))"
       shell: bash
       env:
         GOFLAGS: -mod=vendor
-      run: |
-        set -euo pipefail
-        # Determine which validator phases to build.
-        # validator_phases takes precedence; build_validators is a deprecated fallback.
-        if [[ -n "${{ inputs.validator_phases }}" ]]; then
-          if [[ "${{ inputs.validator_phases }}" == "none" ]]; then
-            echo "Skipping validator builds (validator_phases=none)"
-            exit 0
-          fi
-          PHASES="${{ inputs.validator_phases }}"
-        else
-          # Default: build all phases (backwards compatible)
-          PHASES="deployment,performance,conformance"
-        fi
-
-        # Compile only the requested validator binaries.
-        mkdir -p dist/validator
-        for phase in ${PHASES//,/ }; do
-          echo "Building validator binary: ${phase}"
-          CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}"
-        done
-
-        for phase in ${PHASES//,/ }; do
-          mkdir -p "validators/${phase}/testdata"
-          docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <<DOCKERFILE
-        FROM gcr.io/distroless/static-debian12:nonroot
-        COPY dist/validator/${phase} /${phase}
-        COPY validators/${phase}/testdata /app/testdata
-        WORKDIR /app
-        USER nonroot
-        ENTRYPOINT ["/${phase}"]
-        DOCKERFILE
-          # Validator images are small (~30MB distroless), but share the same
-          # Docker-in-Docker bridge as the smoke-test load above. 600s per
-          # attempt accommodates I/O queuing behind concurrent image pulls.
-          timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
-            echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..."
-            timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
-          }
-        done
+        VALIDATOR_PHASES: ${{ inputs.validator_phases }}
+      run: bash "${{ github.action_path }}/build-validator-images.sh"
 
     - name: Stage aicr binary at repo root
       if: inputs.build_cli == 'true'
       shell: bash
-      run: |
-        set -euo pipefail
-        cp dist/aicr ./aicr
+      run: bash "${{ github.action_path }}/stage-cli.sh"
diff --git a/.github/actions/aicr-build/build-cli.sh b/.github/actions/aicr-build/build-cli.sh
new file mode 100644
index 000000000..81b657f58
--- /dev/null
+++ b/.github/actions/aicr-build/build-cli.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+mkdir -p dist
+if [[ -x dist/aicr ]]; then
+  echo "Reusing existing dist/aicr"
+  exit 0
+fi
+
+CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
diff --git a/.github/actions/aicr-build/build-snapshot-agent.sh b/.github/actions/aicr-build/build-snapshot-agent.sh
new file mode 100644
index 000000000..9650fbd0d
--- /dev/null
+++ b/.github/actions/aicr-build/build-snapshot-agent.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
+# Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) because only nvidia-smi is needed.
+docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
+FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04
+COPY dist/aicr /usr/local/bin/aicr
+ENTRYPOINT ["/usr/local/bin/aicr"]
+DOCKERFILE
+
+# Load onto all nodes. The snapshot agent requests nvidia.com/gpu but does not
+# set a node selector, so it can land on any GPU-capable node, including the
+# control-plane in the L40G smoke test. Allow 900s per attempt and retry once.
+timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
+  echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
+  timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
+}
diff --git a/.github/actions/aicr-build/build-validator-images.sh b/.github/actions/aicr-build/build-validator-images.sh
new file mode 100644
index 000000000..4389b15b4
--- /dev/null
+++ b/.github/actions/aicr-build/build-validator-images.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+if [[ -n "${VALIDATOR_PHASES}" ]]; then
+  if [[ "${VALIDATOR_PHASES}" == "none" ]]; then
+    echo "Skipping validator builds (validator_phases=none)"
+    exit 0
+  fi
+  PHASES="${VALIDATOR_PHASES}"
+else
+  # Default: build all phases (backwards compatible).
+  PHASES="deployment,performance,conformance"
+fi
+
+mkdir -p dist/validator
+for phase in ${PHASES//,/ }; do
+  echo "Building validator binary: ${phase}"
+  CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}"
+done
+
+for phase in ${PHASES//,/ }; do
+  mkdir -p "validators/${phase}/testdata"
+  docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <<DOCKERFILE
+FROM gcr.io/distroless/static-debian12:nonroot
+COPY dist/validator/${phase} /${phase}
+COPY validators/${phase}/testdata /app/testdata
+WORKDIR /app
+USER nonroot
+ENTRYPOINT ["/${phase}"]
+DOCKERFILE
+  timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
+    echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..."
+    timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
+  }
+done
diff --git a/.github/actions/aicr-build/stage-cli.sh b/.github/actions/aicr-build/stage-cli.sh
new file mode 100644
index 000000000..929aed9e8
--- /dev/null
+++ b/.github/actions/aicr-build/stage-cli.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+cp dist/aicr ./aicr
diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index 59b52e3f0..0172b00a9 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -66,446 +66,4 @@ runs:
         RECOVER_UNHEALTHY: ${{ inputs.recover_unhealthy }}
         RECOVERY_COMPONENTS: ${{ inputs.recovery_components }}
         MAX_RECOVERY_ATTEMPTS: ${{ inputs.max_recovery_attempts }}
-      run: |
-        set -euo pipefail
-
-        validate_duration_input() {
-          local input_name="$1"
-          local input_value="$2"
-
-          if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then
-            echo "::error::${input_name} must be a duration like 60s, 2m, or 1h; got '${input_value}'"
-            exit 1
-          fi
-        }
-
-        MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}"
-        MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}"
-        if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then
-          echo "::error::max_recovery_attempts must be a non-negative integer, got '${MAX_RECOVERY_ATTEMPTS}'"
-          exit 1
-        fi
-
-        WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}"
-        WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}"
-        validate_duration_input wait_timeout "${WAIT_TIMEOUT}"
-
-        STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}"
-        STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}"
-        if [[ -z "${STABILITY_WINDOW}" ]]; then
-          STABILITY_WINDOW="0s"
-        fi
-        validate_duration_input stability_window "${STABILITY_WINDOW}"
-        if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then
-          STABILITY_WINDOW="0s"
-        fi
-
-        RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}"
-        RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}"
-        case "${RECOVER_UNHEALTHY}" in
-          true|false) ;;
-          *)
-            echo "::error::recover_unhealthy must be true or false, got '${RECOVER_UNHEALTHY}'"
-            exit 1
-            ;;
-        esac
-
-        kubectl_kind() {
-          timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
-        }
-
-        docker_timeout() {
-          timeout 30s docker "$@"
-        }
-
-        STATIC_POD_RECREATE_SETTLE_SECONDS=5
-        RESTART_COUNT_ATTEMPTS=3
-        RESTART_COUNT_RETRY_SLEEP_SECONDS=2
-        declare -A RECOVERY_ATTEMPTS=()
-        declare -A INITIAL_RESTARTS=()
-
-        kubectl_kind get --raw='/readyz' || true
-
-        wait_ready() {
-          local component="$1"
-          local selector="component=${component}"
-
-          if ! timeout "${WAIT_TIMEOUT}" kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
-            wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then
-            return 1
-          fi
-        }
-
-        restart_total() {
-          local component="$1"
-          local selector="component=${component}"
-          local restart_counts
-          local restart_count
-          local total=0
-          local attempt
-
-          for ((attempt = 1; attempt <= RESTART_COUNT_ATTEMPTS; attempt++)); do
-            if restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \
-              -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then
-              if [[ -n "${restart_counts}" ]]; then
-                break
-              fi
-              echo "::warning::no container statuses found for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
-            else
-              echo "::warning::failed to read restart counts for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
-            fi
-
-            if (( attempt < RESTART_COUNT_ATTEMPTS )); then
-              sleep "${RESTART_COUNT_RETRY_SLEEP_SECONDS}"
-            fi
-          done
-
-          if [[ -z "${restart_counts}" ]]; then
-            echo "::error::no container statuses found for ${component} pods after ${RESTART_COUNT_ATTEMPTS} attempts" >&2
-            dump_component_diagnostics "${component}" >&2
-            exit 1
-          fi
-
-          while IFS= read -r restart_count; do
-            [[ -z "${restart_count}" ]] && continue
-            total=$((total + restart_count))
-          done <<< "${restart_counts}"
-          echo "${total}"
-        }
-
-        report_restart_baseline() {
-          local component="$1"
-          local restart_count="$2"
-
-          if (( restart_count > 0 )); then
-            echo "::warning::${component} has historical restartCount=${restart_count}; checking current readiness and stability window only"
-            return
-          fi
-          echo "${component} restartCount=${restart_count}"
-        }
-
-        dump_control_plane_summary() {
-          echo "=== Control-plane pod restart summary ==="
-          kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane -o wide || true
-          kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane \
-            -o jsonpath='{range .items[*]}{.metadata.name}{" restartCount="}{range .status.containerStatuses[*]}{.restartCount}{" "}{end}{"\n"}{end}' || true
-        }
-
-        require_readyz() {
-          local reason="$1"
-
-          if ! kubectl_kind get --raw='/readyz'; then
-            echo "::error::kube-apiserver /readyz failed ${reason}"
-            dump_all_control_plane_runtime_diagnostics
-            exit 1
-          fi
-        }
-
-        dump_api_server_health() {
-          local endpoint
-
-          for endpoint in '/livez?verbose' '/readyz?verbose' '/healthz'; do
-            echo "=== kube-apiserver ${endpoint} ==="
-            kubectl_kind get --raw="${endpoint}" || true
-          done
-        }
-
-        dump_kind_node_runtime_summary() {
-          local node="${KIND_CLUSTER_NAME}-control-plane"
-
-          if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
-            echo "::warning::cannot collect node runtime summary: kind node container ${node} not found"
-            return
-          fi
-
-          echo "=== ${node} docker stats ==="
-          docker_timeout stats --no-stream \
-            --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \
-            "${node}" || true
-
-          echo "=== ${node} docker inspect state ==="
-          docker_timeout inspect \
-            --format 'status={{.State.Status}} running={{.State.Running}} oomKilled={{.State.OOMKilled}} pid={{.State.Pid}} started={{.State.StartedAt}} finished={{.State.FinishedAt}}' \
-            "${node}" || true
-
-          echo "=== ${node} node pressure snapshot ==="
-          docker_timeout exec "${node}" sh -c '
-            date
-            uptime || true
-            free -h || true
-            df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
-            echo "--- top cpu/memory processes ---"
-            ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
-          ' || true
-
-          echo "=== ${node} CRI pod/container summary ==="
-          docker_timeout exec "${node}" crictl pods || true
-          docker_timeout exec "${node}" crictl ps -a || true
-          docker_timeout exec "${node}" crictl stats || true
-        }
-
-        dump_static_pod_runtime_diagnostics() {
-          local component="$1"
-          local node="${KIND_CLUSTER_NAME}-control-plane"
-          local container_ids
-          local container_id
-          local count=0
-
-          if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
-            echo "::warning::cannot collect ${component} runtime diagnostics: kind node container ${node} not found"
-            return
-          fi
-
-          echo "=== ${node} ${component} static pod manifest ==="
-          docker_timeout exec "${node}" sh -c "sed -n '1,220p' /etc/kubernetes/manifests/${component}.yaml" || true
-
-          echo "=== ${node} ${component} CRI containers ==="
-          docker_timeout exec "${node}" crictl ps -a --name "${component}" || true
-
-          container_ids=$(docker_timeout exec "${node}" crictl ps -a --name "${component}" -q 2>/dev/null || true)
-          for container_id in ${container_ids}; do
-            count=$((count + 1))
-            if (( count > 8 )); then
-              echo "Skipping remaining ${component} CRI containers after first 8 entries."
-              break
-            fi
-
-            echo "=== ${node} crictl inspect ${component} ${container_id} ==="
-            docker_timeout exec "${node}" crictl inspect "${container_id}" || true
-            echo "=== ${node} crictl logs ${component} ${container_id} ==="
-            docker_timeout exec "${node}" crictl logs --tail=200 "${container_id}" || true
-          done
-
-          echo "=== ${node} kubelet journal (${component}) ==="
-          docker_timeout exec "${node}" journalctl -u kubelet --since '45 minutes ago' --no-pager 2>/dev/null \
-            | grep -Ei "${component}|static pod|mirror pod|probe|liveness|readiness|startup|back-off|backoff|container|failed|error|oom|killed" \
-            | tail -200 || true
-
-          echo "=== ${node} containerd journal (${component}) ==="
-          docker_timeout exec "${node}" journalctl -u containerd --since '45 minutes ago' --no-pager 2>/dev/null \
-            | grep -Ei "${component}|container|task|shim|deadline|failed|error|oom|killed" \
-            | tail -200 || true
-        }
-
-        dump_all_control_plane_runtime_diagnostics() {
-          local component
-
-          dump_control_plane_summary
-          dump_api_server_health
-          dump_kind_node_runtime_summary
-          for component in ${COMPONENTS}; do
-            dump_static_pod_runtime_diagnostics "${component}"
-            kubectl_kind -n kube-system get lease "${component}" -o yaml 2>/dev/null || true
-          done
-        }
-
-        dump_component_diagnostics() {
-          local component="$1"
-          local selector="component=${component}"
-          local pods
-          local pod
-
-          dump_control_plane_summary
-          kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true
-          kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true
-          kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-
-          pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name 2>/dev/null || true)
-          while IFS= read -r pod; do
-            [[ -z "${pod}" ]] && continue
-            echo "=== ${pod} logs ==="
-            kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --tail=100 2>/dev/null || true
-            echo "=== ${pod} previous logs ==="
-            kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true
-          done <<< "${pods}"
-
-          dump_all_control_plane_runtime_diagnostics
-          kubectl_kind -n kube-system get lease "${component}" -o yaml 2>/dev/null || true
-        }
-
-        is_recovery_component() {
-          local component="$1"
-          local candidate
-
-          for candidate in ${RECOVERY_COMPONENTS}; do
-            if [[ "${candidate}" == "${component}" ]]; then
-              return 0
-            fi
-          done
-          return 1
-        }
-
-        try_recover_component() {
-          local component="$1"
-          local reason="$2"
-          local node="${KIND_CLUSTER_NAME}-control-plane"
-          local attempt
-          local container_ids
-          local container_id
-
-          if [[ "${RECOVER_UNHEALTHY}" != "true" ]]; then
-            return 1
-          fi
-          if (( MAX_RECOVERY_ATTEMPTS == 0 )); then
-            return 1
-          fi
-          if ! is_recovery_component "${component}"; then
-            return 1
-          fi
-
-          attempt="${RECOVERY_ATTEMPTS[${component}]:-0}"
-          if (( attempt >= MAX_RECOVERY_ATTEMPTS )); then
-            return 1
-          fi
-          RECOVERY_ATTEMPTS["${component}"]=$((attempt + 1))
-
-          echo "::warning::${component} is unhealthy (${reason}); restarting static pod container (attempt $((attempt + 1))/${MAX_RECOVERY_ATTEMPTS})"
-          dump_component_diagnostics "${component}"
-
-          if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
-            echo "::warning::cannot recover ${component}: kind node container ${node} not found"
-            return 1
-          fi
-
-          if ! container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null); then
-            echo "::warning::cannot recover ${component}: timed out or failed to list containers in ${node}"
-            return 1
-          fi
-          if [[ -z "${container_ids}" ]]; then
-            echo "::warning::cannot recover ${component}: no running container found in ${node}"
-            return 1
-          fi
-
-          for container_id in ${container_ids}; do
-            echo "Stopping ${component} container ${container_id} in ${node}..."
-            if ! docker_timeout exec "${node}" crictl stop "${container_id}"; then
-              echo "::warning::failed to stop ${component} container ${container_id}"
-              return 1
-            fi
-          done
-
-          # Give kubelet a short interval to observe the stopped CRI container
-          # and refresh the mirror pod before kubectl wait reads pod status.
-          sleep "${STATIC_POD_RECREATE_SETTLE_SECONDS}"
-          if ! wait_ready "${component}"; then
-            echo "::warning::${component} did not recover after static pod container restart"
-            dump_component_diagnostics "${component}"
-            kubectl_kind get --raw='/readyz' || true
-            return 1
-          fi
-
-          echo "${component} recovered after static pod container restart."
-          return 0
-        }
-
-        check_component() {
-          local component="$1"
-          local selector="component=${component}"
-          local pods
-          local initial_restarts
-
-          if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then
-            if ! try_recover_component "${component}" "failed to list pods in ${NAMESPACE} with selector ${selector}"; then
-              echo "::error::failed to list ${component} pods in ${NAMESPACE} with selector ${selector}"
-              kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
-              exit 1
-            fi
-            if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then
-              echo "::error::failed to list ${component} pods after recovery"
-              kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
-              exit 1
-            fi
-          fi
-          if [[ -z "${pods}" ]]; then
-            echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}"
-            kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
-            exit 1
-          fi
-
-          if ! wait_ready "${component}"; then
-            if ! try_recover_component "${component}" "pods did not become Ready within ${WAIT_TIMEOUT}"; then
-              echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}"
-              dump_component_diagnostics "${component}"
-              kubectl_kind get --raw='/readyz' || true
-              exit 1
-            fi
-          fi
-          initial_restarts=$(restart_total "${component}")
-          report_restart_baseline "${component}" "${initial_restarts}"
-          INITIAL_RESTARTS["${component}"]="${initial_restarts}"
-        }
-
-        verify_stability_window() {
-          local component
-          local initial_restarts
-          local final_restarts
-          local recovered=false
-
-          if [[ "${STABILITY_WINDOW}" == "0s" ]]; then
-            return
-          fi
-
-          echo "Observing control-plane stability for ${STABILITY_WINDOW}..."
-          sleep "${STABILITY_WINDOW}"
-          for component in ${COMPONENTS}; do
-            initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
-            if [[ -z "${initial_restarts}" ]]; then
-              echo "::error::missing initial restart count for ${component}"
-              exit 1
-            fi
-            if ! wait_ready "${component}"; then
-              if ! try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then
-                echo "::error::${component} pods became unready during ${STABILITY_WINDOW}"
-                dump_component_diagnostics "${component}"
-                kubectl_kind get --raw='/readyz' || true
-                exit 1
-              fi
-              initial_restarts=$(restart_total "${component}")
-              report_restart_baseline "${component}" "${initial_restarts}"
-              INITIAL_RESTARTS["${component}"]="${initial_restarts}"
-              recovered=true
-              continue
-            fi
-            final_restarts=$(restart_total "${component}")
-            if (( final_restarts > initial_restarts )); then
-              echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"
-              dump_component_diagnostics "${component}"
-              kubectl_kind get --raw='/readyz' || true
-              exit 1
-            fi
-            INITIAL_RESTARTS["${component}"]="${final_restarts}"
-          done
-
-          if [[ "${recovered}" != "true" ]]; then
-            return
-          fi
-
-          echo "::warning::control-plane recovery occurred; observing one additional ${STABILITY_WINDOW} stability window"
-          sleep "${STABILITY_WINDOW}"
-          for component in ${COMPONENTS}; do
-            initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
-            if [[ -z "${initial_restarts}" ]]; then
-              echo "::error::missing post-recovery restart count for ${component}"
-              exit 1
-            fi
-            if ! wait_ready "${component}"; then
-              echo "::error::${component} pods became unready after recovery"
-              dump_component_diagnostics "${component}"
-              kubectl_kind get --raw='/readyz' || true
-              exit 1
-            fi
-            final_restarts=$(restart_total "${component}")
-            if (( final_restarts > initial_restarts )); then
-              echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery"
-              dump_component_diagnostics "${component}"
-              exit 1
-            fi
-            INITIAL_RESTARTS["${component}"]="${final_restarts}"
-          done
-        }
-
-        for component in ${COMPONENTS}; do
-          check_component "${component}"
-        done
-        verify_stability_window
-        require_readyz "after stability window"
+      run: bash "${{ github.action_path }}/check-control-plane-health.sh"
diff --git a/.github/actions/check-control-plane-health/check-control-plane-health.sh b/.github/actions/check-control-plane-health/check-control-plane-health.sh
new file mode 100644
index 000000000..3614df47f
--- /dev/null
+++ b/.github/actions/check-control-plane-health/check-control-plane-health.sh
@@ -0,0 +1,457 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+validate_duration_input() {
+  # Accept only <digits><s|m|h>; otherwise print an error annotation naming $1 and exit the script.
+  local name="$1" value="$2"
+
+  [[ "${value}" =~ ^[0-9]+[smh]$ ]] && return 0
+
+  echo "::error::${name} must be a duration like 60s, 2m, or 1h; got '${value}'"
+  exit 1
+}
+
+MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}"  # strip leading whitespace
+MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}"  # strip trailing whitespace
+if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then
+  echo "::error::max_recovery_attempts must be a non-negative integer, got '${MAX_RECOVERY_ATTEMPTS}'"
+  exit 1
+fi
+# Force base 10: a zero-padded value such as "08" is otherwise rejected as invalid octal by (( )).
+MAX_RECOVERY_ATTEMPTS=$((10#${MAX_RECOVERY_ATTEMPTS}))
+
+WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}"
+WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}"
+validate_duration_input wait_timeout "${WAIT_TIMEOUT}"
+
+STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}"
+STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}"
+STABILITY_WINDOW="${STABILITY_WINDOW:-0s}"  # an empty input means "0s"
+validate_duration_input stability_window "${STABILITY_WINDOW}"
+if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then  # canonicalize every zero-length spelling (00s, 0m, 0h) to "0s"
+  STABILITY_WINDOW="0s"
+fi
+
+RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}"
+RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}"
+case "${RECOVER_UNHEALTHY}" in
+  true|false) ;;
+  *)
+    echo "::error::recover_unhealthy must be true or false, got '${RECOVER_UNHEALTHY}'"
+    exit 1
+    ;;
+esac
+
+kubectl_kind() {  # kubectl against the kind-<KIND_CLUSTER_NAME> context; args pass through unchanged
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"  # 10s per request, 30s for the whole call
+}
+
+docker_timeout() {  # docker with the given args, bounded to 30s by timeout(1)
+  timeout 30s docker "$@"
+}
+
+STATIC_POD_RECREATE_SETTLE_SECONDS=5  # pause after stopping a container, before readiness is re-checked
+RESTART_COUNT_ATTEMPTS=3  # tries restart_total makes to read restart counts
+RESTART_COUNT_RETRY_SLEEP_SECONDS=2  # delay between those tries
+declare -A RECOVERY_ATTEMPTS=()  # component -> recovery attempts used so far
+declare -A INITIAL_RESTARTS=()  # component -> restart-count baseline
+
+kubectl_kind get --raw='/readyz' || true  # best-effort probe; a failure here never stops the script
+
+wait_ready() {
+  # Wait up to WAIT_TIMEOUT for the pods labelled component=$1 to report Ready.
+  # Any failure, including the timeout, is collapsed to exit status 1.
+  local label="component=$1"
+
+  timeout "${WAIT_TIMEOUT}" kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" \
+    -n "${NAMESPACE}" wait --for=condition=Ready pod -l "${label}" --timeout="${WAIT_TIMEOUT}" \
+    || return 1
+}
+
+restart_total() {  # print the summed restartCount of every container in pods labelled component=$1
+  local component="$1"
+  local selector="component=${component}"
+  local restart_counts
+  local restart_count
+  local total=0
+  local attempt
+
+  for ((attempt = 1; attempt <= RESTART_COUNT_ATTEMPTS; attempt++)); do  # retry reads that fail or come back empty
+    if restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \
+      -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then
+      if [[ -n "${restart_counts}" ]]; then
+        break  # at least one count was read; stop retrying
+      fi
+      echo "::warning::no container statuses found for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
+    else
+      echo "::warning::failed to read restart counts for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
+    fi
+
+    if (( attempt < RESTART_COUNT_ATTEMPTS )); then  # no sleep after the final attempt
+      sleep "${RESTART_COUNT_RETRY_SLEEP_SECONDS}"
+    fi
+  done
+
+  if [[ -z "${restart_counts}" ]]; then
+    echo "::error::no container statuses found for ${component} pods after ${RESTART_COUNT_ATTEMPTS} attempts" >&2
+    dump_component_diagnostics "${component}" >&2  # stderr only: stdout carries the numeric result
+    exit 1  # when called as $(restart_total ...), this ends only that subshell; the failed assignment must stop the caller
+  fi
+
+  while IFS= read -r restart_count; do  # one restartCount per line
+    [[ -z "${restart_count}" ]] && continue
+    total=$((total + restart_count))
+  done <<< "${restart_counts}"
+  echo "${total}"
+}
+
+report_restart_baseline() {
+  # Log the baseline restart count ($2) of component $1; a non-zero count is a warning annotation, not a failure.
+  local name="$1" baseline="$2"
+
+  if (( baseline > 0 )); then
+    echo "::warning::${name} has historical restartCount=${baseline}; checking current readiness and stability window only"
+  else
+    echo "${name} restartCount=${baseline}"
+  fi
+}
+
+dump_control_plane_summary() {  # best-effort listing of tier=control-plane pods and their per-container restart counts
+  echo "=== Control-plane pod restart summary ==="
+  kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane -o wide || true
+  kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane \
+    -o jsonpath='{range .items[*]}{.metadata.name}{" restartCount="}{range .status.containerStatuses[*]}{.restartCount}{" "}{end}{"\n"}{end}' || true
+}
+
+require_readyz() {
+  # Hard gate: unless the API server answers /readyz, dump diagnostics and exit. $1 is appended to the error text.
+  local context="$1"
+  kubectl_kind get --raw='/readyz' && return 0
+
+  echo "::error::kube-apiserver /readyz failed ${context}"
+  dump_all_control_plane_runtime_diagnostics
+  exit 1
+}
+
+dump_api_server_health() {
+  # Best-effort dump of the API server's verbose /livez and /readyz output, plus /healthz.
+  local path
+  for path in '/livez?verbose' '/readyz?verbose' '/healthz'; do
+    echo "=== kube-apiserver ${path} ==="
+    kubectl_kind get --raw="${path}" || true
+  done
+}
+
+dump_kind_node_runtime_summary() {  # best-effort resource and CRI snapshot of the kind control-plane node container
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+
+  if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then  # nothing to collect without the node container
+    echo "::warning::cannot collect node runtime summary: kind node container ${node} not found"
+    return
+  fi
+
+  echo "=== ${node} docker stats ==="
+  docker_timeout stats --no-stream \
+    --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \
+    "${node}" || true
+
+  echo "=== ${node} docker inspect state ==="
+  docker_timeout inspect \
+    --format 'status={{.State.Status}} running={{.State.Running}} oomKilled={{.State.OOMKilled}} pid={{.State.Pid}} started={{.State.StartedAt}} finished={{.State.FinishedAt}}' \
+    "${node}" || true
+
+  echo "=== ${node} node pressure snapshot ==="  # the script below runs inside the node container
+  docker_timeout exec "${node}" sh -c '
+    date
+    uptime || true
+    free -h || true
+    df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+    echo "--- top cpu/memory processes ---"
+    ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
+  ' || true
+
+  echo "=== ${node} CRI pod/container summary ==="
+  docker_timeout exec "${node}" crictl pods || true
+  docker_timeout exec "${node}" crictl ps -a || true  # -a: include containers that are not running
+  docker_timeout exec "${node}" crictl stats || true
+}
+
+dump_static_pod_runtime_diagnostics() {  # best-effort node-level dump for component $1: manifest, CRI containers, journals
+  local component="$1"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local container_ids
+  local container_id
+  local count=0
+
+  if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then  # nothing to collect without the node container
+    echo "::warning::cannot collect ${component} runtime diagnostics: kind node container ${node} not found"
+    return
+  fi
+
+  echo "=== ${node} ${component} static pod manifest ==="
+  docker_timeout exec "${node}" sed -n '1,220p' "/etc/kubernetes/manifests/${component}.yaml" || true  # argv, not sh -c: the name is never re-parsed by a shell
+
+  echo "=== ${node} ${component} CRI containers ==="
+  docker_timeout exec "${node}" crictl ps -a --name "${component}" || true
+
+  container_ids=$(docker_timeout exec "${node}" crictl ps -a --name "${component}" -q 2>/dev/null || true)
+  for container_id in ${container_ids}; do  # word-split: one container ID per pass
+    count=$((count + 1))
+    if (( count > 8 )); then  # cap the log volume at eight containers
+      echo "Skipping remaining ${component} CRI containers after first 8 entries."
+      break
+    fi
+
+    echo "=== ${node} crictl inspect ${component} ${container_id} ==="
+    docker_timeout exec "${node}" crictl inspect "${container_id}" || true
+    echo "=== ${node} crictl logs ${component} ${container_id} ==="
+    docker_timeout exec "${node}" crictl logs --tail=200 "${container_id}" || true
+  done
+
+  echo "=== ${node} kubelet journal (${component}) ==="  # last 45 minutes, filtered to relevant lines, newest 200
+  docker_timeout exec "${node}" journalctl -u kubelet --since '45 minutes ago' --no-pager 2>/dev/null \
+    | grep -Ei "${component}|static pod|mirror pod|probe|liveness|readiness|startup|back-off|backoff|container|failed|error|oom|killed" \
+    | tail -200 || true
+
+  echo "=== ${node} containerd journal (${component}) ==="  # same window and cap, containerd unit
+  docker_timeout exec "${node}" journalctl -u containerd --since '45 minutes ago' --no-pager 2>/dev/null \
+    | grep -Ei "${component}|container|task|shim|deadline|failed|error|oom|killed" \
+    | tail -200 || true
+}
+
+dump_all_control_plane_runtime_diagnostics() {  # full dump: pod summary, API health, node snapshot, then per-component detail
+  local component
+
+  dump_control_plane_summary
+  dump_api_server_health
+  dump_kind_node_runtime_summary
+  for component in ${COMPONENTS}; do  # COMPONENTS is word-split: one component name per pass
+    dump_static_pod_runtime_diagnostics "${component}"
+    kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true  # lease named after the component; errors are ignored
+  done
+}
+
+dump_component_diagnostics() {  # best-effort dump for component $1: pods, events, logs, then the full control-plane dump
+  local component="$1"
+  local selector="component=${component}"
+  local pods
+  local pod
+
+  dump_control_plane_summary
+  kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true
+  kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true
+  kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true  # 30 most recent events
+
+  pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name 2>/dev/null || true)
+  while IFS= read -r pod; do  # one pod name per line
+    [[ -z "${pod}" ]] && continue
+    echo "=== ${pod} logs ==="
+    kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --tail=100 2>/dev/null || true
+    echo "=== ${pod} previous logs ==="
+    kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true
+  done <<< "${pods}"
+
+  dump_all_control_plane_runtime_diagnostics  # NOTE(review): this prints the pod summary again, plus a lease for each COMPONENTS entry
+  kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true  # repeats that lease when $1 is listed in COMPONENTS
+}
+
+is_recovery_component() {
+  # Succeeds (status 0) only when $1 equals one of the words in
+  # RECOVERY_COMPONENTS; otherwise returns 1.
+  local wanted="$1"
+  local entry
+
+  for entry in ${RECOVERY_COMPONENTS}; do
+    [[ "${entry}" != "${wanted}" ]] || return 0
+  done
+  return 1
+}
+
+try_recover_component() {  # stop component $1's running container(s) and wait for Ready; returns 0 only if it became Ready
+  local component="$1"
+  local reason="$2"  # text shown in the warning annotation
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local attempt
+  local container_ids
+  local container_id
+
+  if [[ "${RECOVER_UNHEALTHY}" != "true" ]]; then  # recovery is disabled
+    return 1
+  fi
+  if (( MAX_RECOVERY_ATTEMPTS == 0 )); then  # a budget of zero also disables recovery
+    return 1
+  fi
+  if ! is_recovery_component "${component}"; then  # only components listed in RECOVERY_COMPONENTS are eligible
+    return 1
+  fi
+
+  attempt="${RECOVERY_ATTEMPTS[${component}]:-0}"
+  if (( attempt >= MAX_RECOVERY_ATTEMPTS )); then  # per-component budget is used up
+    return 1
+  fi
+  RECOVERY_ATTEMPTS["${component}"]=$((attempt + 1))  # charged before acting, so failed tries count too
+
+  echo "::warning::${component} is unhealthy (${reason}); restarting static pod container (attempt $((attempt + 1))/${MAX_RECOVERY_ATTEMPTS})"
+  dump_component_diagnostics "${component}"  # capture the unhealthy state before changing anything
+
+  if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
+    echo "::warning::cannot recover ${component}: kind node container ${node} not found"
+    return 1
+  fi
+
+  if ! container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null); then  # -q: container IDs only
+    echo "::warning::cannot recover ${component}: timed out or failed to list containers in ${node}"
+    return 1
+  fi
+  if [[ -z "${container_ids}" ]]; then
+    echo "::warning::cannot recover ${component}: no running container found in ${node}"
+    return 1
+  fi
+
+  for container_id in ${container_ids}; do  # word-split: one container ID per pass
+    echo "Stopping ${component} container ${container_id} in ${node}..."
+    if ! docker_timeout exec "${node}" crictl stop "${container_id}"; then  # give up on the first container that cannot be stopped
+      echo "::warning::failed to stop ${component} container ${container_id}"
+      return 1
+    fi
+  done
+
+  # Give kubelet a short interval to observe the stopped CRI container
+  # and refresh the mirror pod before kubectl wait reads pod status.
+  sleep "${STATIC_POD_RECREATE_SETTLE_SECONDS}"
+  if ! wait_ready "${component}"; then
+    echo "::warning::${component} did not recover after static pod container restart"
+    dump_component_diagnostics "${component}"
+    kubectl_kind get --raw='/readyz' || true
+    return 1
+  fi
+
+  echo "${component} recovered after static pod container restart."
+  return 0
+}
+
+check_component() {  # require component $1's pods to exist and be Ready (one recovery try allowed), then record its restart baseline
+  local component="$1"
+  local selector="component=${component}"
+  local pods
+  local initial_restarts
+
+  if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then  # the listing call itself failed
+    if ! try_recover_component "${component}" "failed to list pods in ${NAMESPACE} with selector ${selector}"; then
+      echo "::error::failed to list ${component} pods in ${NAMESPACE} with selector ${selector}"
+      kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+      exit 1
+    fi
+    if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then  # list again after a successful recovery
+      echo "::error::failed to list ${component} pods after recovery"
+      kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+      exit 1
+    fi
+  fi
+  if [[ -z "${pods}" ]]; then  # listing worked but matched nothing; no recovery is attempted for this case
+    echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}"
+    kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+    exit 1
+  fi
+
+  if ! wait_ready "${component}"; then
+    if ! try_recover_component "${component}" "pods did not become Ready within ${WAIT_TIMEOUT}"; then
+      echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+  fi
+  initial_restarts=$(restart_total "${component}")  # read after any recovery, so a recovery restart is part of the baseline
+  report_restart_baseline "${component}" "${initial_restarts}"
+  INITIAL_RESTARTS["${component}"]="${initial_restarts}"  # stored for the later stability-window comparison
+}
+
+verify_stability_window() {
+  # Sleep for STABILITY_WINDOW, then exit 1 if any component in COMPONENTS is
+  # unready or its restart total rose above its INITIAL_RESTARTS baseline.
+  # An unready component is handed to try_recover_component; if any recovery
+  # succeeds, one more window is observed in which no recovery is attempted.
+  # Every readiness/restart failure prints /readyz (best effort) before exiting.
+  local component initial_restarts final_restarts recovered=false
+
+  [[ "${STABILITY_WINDOW}" == "0s" ]] && return 0  # "0s" skips the observation
+
+  echo "Observing control-plane stability for ${STABILITY_WINDOW}..."
+  sleep "${STABILITY_WINDOW}"
+  for component in ${COMPONENTS}; do
+    initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
+    if [[ -z "${initial_restarts}" ]]; then
+      echo "::error::missing initial restart count for ${component}"
+      exit 1
+    fi
+    if ! wait_ready "${component}"; then
+      if ! try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then
+        echo "::error::${component} pods became unready during ${STABILITY_WINDOW}"
+        dump_component_diagnostics "${component}"
+        kubectl_kind get --raw='/readyz' || true
+        exit 1
+      fi
+      # Recovery stops the component's container, so re-baseline, don't compare.
+      initial_restarts=$(restart_total "${component}")
+      report_restart_baseline "${component}" "${initial_restarts}"
+      INITIAL_RESTARTS["${component}"]="${initial_restarts}"
+      recovered=true
+      continue
+    fi
+    final_restarts=$(restart_total "${component}")
+    if (( final_restarts > initial_restarts )); then
+      echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    INITIAL_RESTARTS["${component}"]="${final_restarts}"
+  done
+
+  [[ "${recovered}" == "true" ]] || return 0
+
+  echo "::warning::control-plane recovery occurred; observing one additional ${STABILITY_WINDOW} stability window"
+  sleep "${STABILITY_WINDOW}"
+  for component in ${COMPONENTS}; do
+    initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
+    if [[ -z "${initial_restarts}" ]]; then
+      echo "::error::missing post-recovery restart count for ${component}"
+      exit 1
+    fi
+    if ! wait_ready "${component}"; then
+      echo "::error::${component} pods became unready after recovery"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    final_restarts=$(restart_total "${component}")
+    if (( final_restarts > initial_restarts )); then
+      echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    INITIAL_RESTARTS["${component}"]="${final_restarts}"
+  done
+}
+
+# Baseline every component, observe the stability window, then run the final require_readyz check.
+for component in ${COMPONENTS}; do check_component "${component}"; done
+
+verify_stability_window
+require_readyz "after stability window"
diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index 21d27800c..324ce7a8f 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -99,12 +99,7 @@ runs:
 
     - name: Validate environment
       shell: bash
-      run: |
-        if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then
-          echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow"
-          exit 1
-        fi
-
+      run: bash "${{ github.action_path }}/validate-env.sh"
     - name: Load versions
       id: versions
       uses: ./.github/actions/load-versions
@@ -130,10 +125,9 @@ runs:
 
     - name: Install nvkind
       shell: bash
-      run: |
-        go install "github.com/NVIDIA/nvkind/cmd/nvkind@${{ steps.versions.outputs.nvkind }}"
-        nvkind --help
-
+      env:
+        NVKIND_VERSION: ${{ steps.versions.outputs.nvkind }}
+      run: bash "${{ github.action_path }}/install-nvkind.sh"
     - name: Runner preflight
       shell: bash
       env:
@@ -141,153 +135,32 @@ runs:
         MIN_GPU_COUNT: ${{ inputs.min_gpu_count }}
         MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
         MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }}
-      run: |
-        set -euo pipefail
-
-        echo "=== Runner baseline ==="
-        date -u
-        hostname
-        uptime
-        nproc
-        free -h
-        df -h /
-        df -ih /
-
-        for value_name in MIN_GPU_COUNT MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do
-          value="${!value_name}"
-          if ! [[ "${value}" =~ ^[0-9]+$ ]]; then
-            echo "::error::${value_name} must be an integer, got '${value}'"
-            exit 1
-          fi
-        done
-
-        echo "=== Docker health ==="
-        docker info >/dev/null
-        docker version
-
-        echo "=== Host GPUs ==="
-        nvidia-smi -L
-        nvidia-smi
-
-        mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader)
-        if [[ -n "${GPU_MODEL_PATTERN}" ]]; then
-          set +e
-          gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic -- "${GPU_MODEL_PATTERN}")
-          grep_status=$?
-          set -e
-          if (( grep_status == 2 )); then
-            echo "::error::invalid gpu_model_pattern regex: ${GPU_MODEL_PATTERN}"
-            exit 1
-          fi
-          if (( grep_status != 0 )); then
-            gpu_count=0
-          fi
-          echo "Visible GPUs matching '${GPU_MODEL_PATTERN}': ${gpu_count}"
-        else
-          gpu_count="${#gpu_names[@]}"
-          echo "Visible GPUs: ${gpu_count}"
-        fi
-
-        if (( gpu_count < MIN_GPU_COUNT )); then
-          echo "::error::visible GPU count ${gpu_count} is below required minimum ${MIN_GPU_COUNT}"
-          exit 1
-        fi
-
-        echo "=== Existing kind state ==="
-        kind get clusters || true
-        docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
-
+      run: bash "${{ github.action_path }}/runner-preflight.sh"
     - name: Configure NVIDIA Container Toolkit for kind
       shell: bash
-      run: |
-        sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
-        sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
-        sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place
-        sudo systemctl restart docker
-
+      run: bash "${{ github.action_path }}/configure-nvidia-container-toolkit.sh"
     - name: Validate Docker GPU access
       shell: bash
-      run: |
-        set -euo pipefail
-        timeout 300s docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L
-
+      run: bash "${{ github.action_path }}/validate-docker-gpu-access.sh"
     - name: Increase inotify limits
       shell: bash
-      run: |
-        sudo sysctl -w fs.inotify.max_user_watches=524288
-        sudo sysctl -w fs.inotify.max_user_instances=1024
-
+      run: bash "${{ github.action_path }}/increase-inotify-limits.sh"
     - name: Delete stale kind cluster
       shell: bash
-      run: |
-        set -euo pipefail
-        kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
-        if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then
-          echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
-          if ! timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then
-            echo "::warning::kind delete cluster timed out or failed; falling back to direct container cleanup"
-          fi
-        else
-          echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
-        fi
-
-        remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")
-        if [[ -n "${remaining_containers}" ]]; then
-          echo "Removing stale containers for ${KIND_CLUSTER_NAME}:"
-          docker ps -a --filter "label=${kind_cluster_label}"
-          docker rm -f ${remaining_containers}
-        fi
-
-        remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")
-        if [[ -n "${remaining_containers}" ]]; then
-          echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:"
-          docker ps -a --filter "label=${kind_cluster_label}"
-          exit 1
-        fi
-
+      run: bash "${{ github.action_path }}/delete-stale-kind-cluster.sh"
     - name: Check runner capacity
       shell: bash
       env:
         MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
         MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }}
-      run: |
-        set -euo pipefail
-        free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
-        if (( free_disk_gb < MIN_FREE_DISK_GB )); then
-          echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB"
-          exit 1
-        fi
-
-        available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}')
-        if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then
-          echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB"
-          exit 1
-        fi
-
-        echo "Runner capacity is sufficient: disk=${free_disk_gb}GiB memory=${available_memory_gb}GiB"
-
+      run: bash "${{ github.action_path }}/check-runner-capacity.sh"
     - name: Warm kind node image
       if: ${{ inputs.kind_node_image != '' }}
       shell: bash
       env:
         KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
         MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
-      run: |
-        set -euo pipefail
-        echo "=== Kind node image cache ==="
-        if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then
-          echo "Kind node image already cached: ${KIND_NODE_IMAGE}"
-        else
-          echo "Pulling kind node image: ${KIND_NODE_IMAGE}"
-          timeout 600s docker pull "${KIND_NODE_IMAGE}"
-        fi
-        free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
-        if (( free_disk_gb < MIN_FREE_DISK_GB )); then
-          echo "::error::free disk on / is ${free_disk_gb}GiB after warming ${KIND_NODE_IMAGE}, need at least ${MIN_FREE_DISK_GB}GiB"
-          exit 1
-        fi
-        echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gb}GiB"
-
+      run: bash "${{ github.action_path }}/warm-kind-node-image.sh"
     - name: Create GPU-enabled kind cluster
       shell: bash
       env:
@@ -306,377 +179,7 @@ runs:
         SCHEDULER_MEMORY_REQUEST: ${{ inputs.scheduler_memory_request }}
         ETCD_CPU_REQUEST: ${{ inputs.etcd_cpu_request }}
         ETCD_MEMORY_REQUEST: ${{ inputs.etcd_memory_request }}
-      run: |
-        set -euo pipefail
-
-        validate_duration_input() {
-          local input_name="$1"
-          local input_value="$2"
-
-          if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then
-            echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'"
-            exit 1
-          fi
-        }
-
-        validate_generated_control_plane_config() {
-          if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
-            for patch_file in "${patch_dir}"/*.yaml; do
-              if ! grep -Fxq 'apiVersion: v1' "${patch_file}" ||
-                ! grep -Fxq 'kind: Pod' "${patch_file}" ||
-                ! grep -Eq '^[[:space:]]+resources:$' "${patch_file}"; then
-                echo "::error::rendered static pod patch ${patch_file} is missing expected top-level YAML"
-                sed 's/^/  /' "${patch_file}" || true
-                exit 1
-              fi
-            done
-
-            if ! grep -Eq '^[[:space:]]*extraMounts:$' "${config_template}" ||
-              ! grep -Fq 'directory: /patches' "${config_template}"; then
-              echo "::error::rendered kind config is missing control-plane patch mounts"
-              sed 's/^/  /' "${config_template}" || true
-              exit 1
-            fi
-          fi
-
-          if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
-            for expected in \
-              'apiVersion: kubeadm.k8s.io/v1beta3' \
-              'apiVersion: kubeadm.k8s.io/v1beta4' \
-              "leader-elect-lease-duration: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
-              "leader-elect-renew-deadline: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
-              "leader-elect-retry-period: \"${LEADER_ELECTION_RETRY_PERIOD}\"" \
-              "value: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
-              "value: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
-              "value: \"${LEADER_ELECTION_RETRY_PERIOD}\""; do
-              if ! grep -Fq "${expected}" "${config_template}"; then
-                echo "::error::rendered kind config is missing expected leader election setting: ${expected}"
-                sed 's/^/  /' "${config_template}" || true
-                exit 1
-              fi
-            done
-          fi
-        }
-
-        validate_duration_input cluster_create_timeout "${CLUSTER_CREATE_TIMEOUT}"
-        validate_duration_input leader_election_lease_duration "${LEADER_ELECTION_LEASE_DURATION}"
-        validate_duration_input leader_election_renew_deadline "${LEADER_ELECTION_RENEW_DEADLINE}"
-        validate_duration_input leader_election_retry_period "${LEADER_ELECTION_RETRY_PERIOD}"
-
-        CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}")
-        if [[ -n "${KIND_NODE_IMAGE}" ]]; then
-          echo "Using kind node image: ${KIND_NODE_IMAGE}"
-          CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}")
-        fi
-
-        case "${CONTROL_PLANE_RESOURCE_PATCHES}" in
-          true) ;;
-          ""|false) CONTROL_PLANE_RESOURCE_PATCHES=false ;;
-          *)
-            echo "::error::control_plane_resource_patches must be true or false, got '${CONTROL_PLANE_RESOURCE_PATCHES}'"
-            exit 1
-            ;;
-        esac
-
-        case "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" in
-          true) ;;
-          ""|false) CONTROL_PLANE_LEADER_ELECTION_TUNING=false ;;
-          *)
-            echo "::error::control_plane_leader_election_tuning must be true or false, got '${CONTROL_PLANE_LEADER_ELECTION_TUNING}'"
-            exit 1
-            ;;
-        esac
-
-        if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
-          patch_dir="$(mktemp -d)"
-          config_template="$(mktemp)"
-
-          # Keep heredoc body indentation aligned with this run block. GitHub
-          # Actions strips the common run: | indent before bash sees it.
-          if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
-          cat > "${patch_dir}/kube-apiserver+strategic.yaml" <<EOF
-        apiVersion: v1
-        kind: Pod
-        metadata:
-          name: kube-apiserver
-          namespace: kube-system
-        spec:
-          containers:
-          - name: kube-apiserver
-            resources:
-              requests:
-                cpu: ${API_SERVER_CPU_REQUEST}
-                memory: ${API_SERVER_MEMORY_REQUEST}
-        EOF
-
-          cat > "${patch_dir}/kube-controller-manager+strategic.yaml" <<EOF
-        apiVersion: v1
-        kind: Pod
-        metadata:
-          name: kube-controller-manager
-          namespace: kube-system
-        spec:
-          containers:
-          - name: kube-controller-manager
-            resources:
-              requests:
-                cpu: ${CONTROLLER_MANAGER_CPU_REQUEST}
-                memory: ${CONTROLLER_MANAGER_MEMORY_REQUEST}
-        EOF
-
-          cat > "${patch_dir}/kube-scheduler+strategic.yaml" <<EOF
-        apiVersion: v1
-        kind: Pod
-        metadata:
-          name: kube-scheduler
-          namespace: kube-system
-        spec:
-          containers:
-          - name: kube-scheduler
-            resources:
-              requests:
-                cpu: ${SCHEDULER_CPU_REQUEST}
-                memory: ${SCHEDULER_MEMORY_REQUEST}
-        EOF
-
-          cat > "${patch_dir}/etcd+strategic.yaml" <<EOF
-        apiVersion: v1
-        kind: Pod
-        metadata:
-          name: etcd
-          namespace: kube-system
-        spec:
-          containers:
-          - name: etcd
-            resources:
-              requests:
-                cpu: ${ETCD_CPU_REQUEST}
-                memory: ${ETCD_MEMORY_REQUEST}
-        EOF
-          fi
-
-          cat > "${config_template}" <<'EOF'
-        kind: Cluster
-        apiVersion: kind.x-k8s.io/v1alpha4
-        {{- if hasKey $ "name" }}
-        name: {{ $.name }}
-        {{- end }}
-        nodes:
-        - role: control-plane
-          {{- if hasKey $ "image" }}
-          image: {{ $.image }}
-          {{- end }}
-        EOF
-          if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
-          cat >> "${config_template}" <<EOF
-          extraMounts:
-          - hostPath: ${patch_dir}
-            containerPath: /patches
-        EOF
-          fi
-          cat >> "${config_template}" <<'EOF'
-          kubeadmConfigPatches:
-        EOF
-          if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
-          cat >> "${config_template}" <<'EOF'
-          - |
-            kind: InitConfiguration
-            patches:
-              directory: /patches
-        EOF
-          fi
-          if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
-          # kind v0.31 renders kubeadm v1beta3. Keep a v1beta4 patch too so
-          # this remains valid when a future kind image switches API versions.
-          cat >> "${config_template}" <<EOF
-          - |
-            kind: ClusterConfiguration
-            apiVersion: kubeadm.k8s.io/v1beta3
-            controllerManager:
-              extraArgs:
-                leader-elect-lease-duration: "${LEADER_ELECTION_LEASE_DURATION}"
-                leader-elect-renew-deadline: "${LEADER_ELECTION_RENEW_DEADLINE}"
-                leader-elect-retry-period: "${LEADER_ELECTION_RETRY_PERIOD}"
-            scheduler:
-              extraArgs:
-                leader-elect-lease-duration: "${LEADER_ELECTION_LEASE_DURATION}"
-                leader-elect-renew-deadline: "${LEADER_ELECTION_RENEW_DEADLINE}"
-                leader-elect-retry-period: "${LEADER_ELECTION_RETRY_PERIOD}"
-          - |
-            kind: ClusterConfiguration
-            apiVersion: kubeadm.k8s.io/v1beta4
-            controllerManager:
-              extraArgs:
-              - name: leader-elect-lease-duration
-                value: "${LEADER_ELECTION_LEASE_DURATION}"
-              - name: leader-elect-renew-deadline
-                value: "${LEADER_ELECTION_RENEW_DEADLINE}"
-              - name: leader-elect-retry-period
-                value: "${LEADER_ELECTION_RETRY_PERIOD}"
-            scheduler:
-              extraArgs:
-              - name: leader-elect-lease-duration
-                value: "${LEADER_ELECTION_LEASE_DURATION}"
-              - name: leader-elect-renew-deadline
-                value: "${LEADER_ELECTION_RENEW_DEADLINE}"
-              - name: leader-elect-retry-period
-                value: "${LEADER_ELECTION_RETRY_PERIOD}"
-        EOF
-          fi
-          cat >> "${config_template}" <<'EOF'
-        {{- range $.workers }}
-        - role: worker
-          {{- if hasKey $ "image" }}
-          image: {{ $.image }}
-          {{- end }}
-
-          {{- if hasKey . "devices" }}
-          {{- $devices := .devices }}
-          {{- if not (kindIs "slice" $devices) }}
-            {{- $devices = list .devices }}
-          {{- end }}
-          extraMounts:
-            # We inject all NVIDIA GPUs using the nvidia-container-runtime.
-            # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set
-            # in `/etc/nvidia-container-runtime/config.toml`
-            {{- range $d := $devices }}
-            - hostPath: /dev/null
-              containerPath: /var/run/nvidia-container-devices/{{ $d }}
-            {{- end }}
-          {{- end }}
-        {{- end }}
-        EOF
-          if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
-            echo "Applying control-plane static pod resource patches from ${patch_dir}:"
-            for patch_file in "${patch_dir}"/*.yaml; do
-              echo "--- ${patch_file}"
-              sed 's/^/  /' "${patch_file}"
-            done
-          fi
-          if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
-            echo "Increasing kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes:"
-            echo "  lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
-            echo "  renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
-            echo "  retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
-          fi
-          validate_generated_control_plane_config
-          CREATE_ARGS+=(--config-template="${config_template}")
-        fi
-
-        set +e
-        timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}"
-        create_status=$?
-        set -e
-        if (( create_status != 0 )); then
-          echo "::warning::nvkind cluster create exited with status ${create_status}; continuing only if post-create checks pass"
-        fi
-
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes | \
-          grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:|  cpu|  memory|  nvidia.com/gpu)" || true
-
-        echo "=== Kind node container resources ==="
-        docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
-          --format '{{.Names}}' | sort | while read -r node_container; do
-            [[ -z "${node_container}" ]] && continue
-            docker inspect "${node_container}" \
-              --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}'
-          done
-
-        echo "=== Control-plane resource requests/limits ==="
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
-          get pods -l tier=control-plane -o json | jq -r '
-            .items[] as $pod |
-            $pod.metadata.name,
-            ($pod.spec.containers[] |
-              "  " + .name +
-              " requests=" + ((.resources.requests // {}) | tostring) +
-              " limits=" + ((.resources.limits // {}) | tostring))
-          ' || true
-
-        normalize_cpu_request() {
-          local cpu="$1"
-
-          if [[ "${cpu}" =~ ^([0-9]+)000m$ ]]; then
-            echo "${BASH_REMATCH[1]}"
-            return
-          fi
-          echo "${cpu}"
-        }
-
-        control_plane_request() {
-          local component="$1"
-          local resource="$2"
-
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
-            get pod -l "component=${component}" \
-            -o "jsonpath={.items[0].spec.containers[0].resources.requests.${resource}}"
-        }
-
-        assert_control_plane_request() {
-          local component="$1"
-          local resource="$2"
-          local expected="$3"
-          local actual
-
-          actual="$(control_plane_request "${component}" "${resource}")"
-          if [[ "${resource}" == "cpu" ]]; then
-            expected="$(normalize_cpu_request "${expected}")"
-            actual="$(normalize_cpu_request "${actual}")"
-          fi
-          if [[ "${actual}" != "${expected}" ]]; then
-            echo "::error::${component} ${resource} request is '${actual}', expected '${expected}'"
-            exit 1
-          fi
-          echo "${component} ${resource} request verified: ${actual}"
-        }
-
-        control_plane_command_args() {
-          local component="$1"
-
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
-            get pod -l "component=${component}" \
-            -o jsonpath='{range .items[0].spec.containers[0].command[*]}{.}{"\n"}{end}{range .items[0].spec.containers[0].args[*]}{.}{"\n"}{end}'
-        }
-
-        assert_control_plane_arg() {
-          local component="$1"
-          local expected="$2"
-          local command_args
-
-          command_args="$(control_plane_command_args "${component}")"
-          if ! grep -Fxq "${expected}" <<< "${command_args}"; then
-            echo "::error::${component} live pod command/args does not contain ${expected}"
-            echo "Observed live command/args:"
-            echo "${command_args}"
-            exit 1
-          fi
-          echo "${component} command/args verified: ${expected}"
-        }
-
-        if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
-          echo "Verifying control-plane resource patches..."
-          assert_control_plane_request kube-apiserver cpu "${API_SERVER_CPU_REQUEST}"
-          assert_control_plane_request kube-apiserver memory "${API_SERVER_MEMORY_REQUEST}"
-          assert_control_plane_request kube-controller-manager cpu "${CONTROLLER_MANAGER_CPU_REQUEST}"
-          assert_control_plane_request kube-controller-manager memory "${CONTROLLER_MANAGER_MEMORY_REQUEST}"
-          assert_control_plane_request kube-scheduler cpu "${SCHEDULER_CPU_REQUEST}"
-          assert_control_plane_request kube-scheduler memory "${SCHEDULER_MEMORY_REQUEST}"
-          assert_control_plane_request etcd cpu "${ETCD_CPU_REQUEST}"
-          assert_control_plane_request etcd memory "${ETCD_MEMORY_REQUEST}"
-        fi
-
-        if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
-          echo "Verifying control-plane leader election timeout patches..."
-          for component in kube-controller-manager kube-scheduler; do
-            assert_control_plane_arg "${component}" "--leader-elect-lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
-            assert_control_plane_arg "${component}" "--leader-elect-renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
-            assert_control_plane_arg "${component}" "--leader-elect-retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
-          done
-        fi
-
+      run: bash "${{ github.action_path }}/create-gpu-kind-cluster.sh"
     - name: Print GPUs (nvkind)
       shell: bash
       run: nvkind cluster print-gpus --name="${KIND_CLUSTER_NAME}"
diff --git a/.github/actions/gpu-cluster-setup/check-runner-capacity.sh b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
new file mode 100644
index 000000000..2e83beeb9
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Fails the step when the runner has less free disk on / or less available memory
+# than the configured minimums.
+#
+# Environment:
+#   MIN_FREE_DISK_GB         required free space on / in GiB (non-negative integer)
+#   MIN_AVAILABLE_MEMORY_GB  required available memory in GiB (non-negative integer)
+
+set -euo pipefail
+
+# Exit 1 with an ::error:: annotation unless $2 is a non-negative integer; $1 names it.
+# Operands are checked before (( )) because arithmetic evaluation treats other text
+# as an expression and an empty operand is a bash syntax error, not a clear failure.
+require_integer() {
+  local label="$1"
+  local value="$2"
+
+  if ! [[ "${value}" =~ ^[0-9]+$ ]]; then
+    echo "::error::${label} must be an integer, got '${value}'"
+    exit 1
+  fi
+}
+
+require_integer MIN_FREE_DISK_GB "${MIN_FREE_DISK_GB:-}"
+require_integer MIN_AVAILABLE_MEMORY_GB "${MIN_AVAILABLE_MEMORY_GB:-}"
+
+free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
+require_integer "free disk reported by df" "${free_disk_gb}"
+if (( free_disk_gb < MIN_FREE_DISK_GB )); then
+  echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB"
+  exit 1
+fi
+
+# Field 7 of the Mem: row is what the messages below call available memory; if the
+# local free output has no such field the value is empty and is rejected clearly.
+available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}')
+require_integer "available memory reported by free" "${available_memory_gb}"
+if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then
+  echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB"
+  exit 1
+fi
+
+echo "Runner capacity is sufficient: disk=${free_disk_gb}GiB memory=${available_memory_gb}GiB"
diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
new file mode 100644
index 000000000..0a2fcd814
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Run nvidia-ctk's runtime configuration for Docker with --set-as-default and CDI enabled.
+sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
+
+# Apply the toolkit config settings in place, one nvidia-ctk call per setting.
+for toolkit_setting in \
+  accept-nvidia-visible-devices-as-volume-mounts=true \
+  accept-nvidia-visible-devices-envvar-when-unprivileged=false; do
+  sudo nvidia-ctk config --set "${toolkit_setting}" --in-place
+done
+
+# Restart Docker last, after both configuration steps (presumably so they take effect).
+sudo systemctl restart docker
diff --git a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
new file mode 100644
index 000000000..42a282e17
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
@@ -0,0 +1,411 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Exits 1 with an ::error:: annotation unless $2 is digits followed by s, m, or h; $1 is the input name shown in the message.
+validate_duration_input() {
+  local name="$1" value="$2"
+  local duration_re='^[0-9]+[smh]$'
+
+  [[ "${value}" =~ ${duration_re} ]] && return 0
+  echo "::error::${name} must be a duration like 300s, 10m, or 1h; got '${value}'"
+  exit 1
+}
+
+# Checks the rendered static pod patches and kind config for expected markers; dumps the offending file and exits 1 when one is absent.
+validate_generated_control_plane_config() {
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+    for patch_file in "${patch_dir}"/*.yaml; do
+      if grep -Fxq 'apiVersion: v1' "${patch_file}" &&
+        grep -Fxq 'kind: Pod' "${patch_file}" &&
+        grep -Eq '^[[:space:]]+resources:$' "${patch_file}"; then
+        continue
+      fi
+      echo "::error::rendered static pod patch ${patch_file} is missing expected top-level YAML"
+      sed 's/^/  /' "${patch_file}" || true
+      exit 1
+    done
+    # The kind config must both mount the patch directory and reference it.
+    if ! { grep -Eq '^[[:space:]]*extraMounts:$' "${config_template}" &&
+      grep -Fq 'directory: /patches' "${config_template}"; }; then
+      echo "::error::rendered kind config is missing control-plane patch mounts"
+      sed 's/^/  /' "${config_template}" || true
+      exit 1
+    fi
+  fi
+
+  [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]] || return 0
+  for expected in \
+    'apiVersion: kubeadm.k8s.io/v1beta3' \
+    'apiVersion: kubeadm.k8s.io/v1beta4' \
+    "leader-elect-lease-duration: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
+    "leader-elect-renew-deadline: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
+    "leader-elect-retry-period: \"${LEADER_ELECTION_RETRY_PERIOD}\"" \
+    "value: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
+    "value: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
+    "value: \"${LEADER_ELECTION_RETRY_PERIOD}\""; do
+    grep -Fq "${expected}" "${config_template}" && continue
+    echo "::error::rendered kind config is missing expected leader election setting: ${expected}"
+    sed 's/^/  /' "${config_template}" || true
+    exit 1
+  done
+}
+
+validate_duration_input cluster_create_timeout "${CLUSTER_CREATE_TIMEOUT}"
+validate_duration_input leader_election_lease_duration "${LEADER_ELECTION_LEASE_DURATION}"
+validate_duration_input leader_election_renew_deadline "${LEADER_ELECTION_RENEW_DEADLINE}"
+validate_duration_input leader_election_retry_period "${LEADER_ELECTION_RETRY_PERIOD}"
+# Start the nvkind argument list; the node image is only pinned when KIND_NODE_IMAGE is non-empty.
+CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}")
+if [[ -n "${KIND_NODE_IMAGE}" ]]; then
+  echo "Using kind node image: ${KIND_NODE_IMAGE}"
+  CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}")
+fi
+# Normalize the toggle: empty counts as false; anything other than true/false aborts with an error.
+case "${CONTROL_PLANE_RESOURCE_PATCHES}" in
+  true) ;;
+  ""|false) CONTROL_PLANE_RESOURCE_PATCHES=false ;;
+  *)
+    echo "::error::control_plane_resource_patches must be true or false, got '${CONTROL_PLANE_RESOURCE_PATCHES}'"
+    exit 1
+    ;;
+esac
+# Same normalization for the leader election tuning toggle.
+case "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" in
+  true) ;;
+  ""|false) CONTROL_PLANE_LEADER_ELECTION_TUNING=false ;;
+  *)
+    echo "::error::control_plane_leader_election_tuning must be true or false, got '${CONTROL_PLANE_LEADER_ELECTION_TUNING}'"
+    exit 1
+    ;;
+esac
+
+if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+  patch_dir="$(mktemp -d)"
+  config_template="$(mktemp)"
+  # patch_dir receives the static pod patch files and is mounted into the control-plane node at /patches; config_template is the kind config passed to nvkind via --config-template.
+  # Keep YAML heredocs at column 0; indentation is literal content.
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  cat > "${patch_dir}/kube-apiserver+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: kube-apiserver
+  namespace: kube-system
+spec:
+  containers:
+  - name: kube-apiserver
+    resources:
+      requests:
+        cpu: ${API_SERVER_CPU_REQUEST}
+        memory: ${API_SERVER_MEMORY_REQUEST}
+EOF
+
+  cat > "${patch_dir}/kube-controller-manager+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: kube-controller-manager
+  namespace: kube-system
+spec:
+  containers:
+  - name: kube-controller-manager
+    resources:
+      requests:
+        cpu: ${CONTROLLER_MANAGER_CPU_REQUEST}
+        memory: ${CONTROLLER_MANAGER_MEMORY_REQUEST}
+EOF
+
+  cat > "${patch_dir}/kube-scheduler+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: kube-scheduler
+  namespace: kube-system
+spec:
+  containers:
+  - name: kube-scheduler
+    resources:
+      requests:
+        cpu: ${SCHEDULER_CPU_REQUEST}
+        memory: ${SCHEDULER_MEMORY_REQUEST}
+EOF
+
+  cat > "${patch_dir}/etcd+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: etcd
+  namespace: kube-system
+spec:
+  containers:
+  - name: etcd
+    resources:
+      requests:
+        cpu: ${ETCD_CPU_REQUEST}
+        memory: ${ETCD_MEMORY_REQUEST}
+EOF
+  fi
+  # Assemble the kind config in pieces: quoted 'EOF' heredocs keep {{ ... }} template syntax literal, unquoted EOF heredocs expand shell variables.
+  cat > "${config_template}" <<'EOF'
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+{{- if hasKey $ "name" }}
+name: {{ $.name }}
+{{- end }}
+nodes:
+- role: control-plane
+  {{- if hasKey $ "image" }}
+  image: {{ $.image }}
+  {{- end }}
+EOF
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  cat >> "${config_template}" <<EOF
+  extraMounts:
+  - hostPath: ${patch_dir}
+    containerPath: /patches
+EOF
+  fi
+  cat >> "${config_template}" <<'EOF'
+  kubeadmConfigPatches:
+EOF
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  cat >> "${config_template}" <<'EOF'
+  - |
+    kind: InitConfiguration
+    patches:
+      directory: /patches
+EOF
+  fi
+  if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+  # kind v0.31 renders kubeadm v1beta3. Keep a v1beta4 patch too so
+  # this remains valid when a future kind image switches API versions.
+  cat >> "${config_template}" <<EOF
+  - |
+    kind: ClusterConfiguration
+    apiVersion: kubeadm.k8s.io/v1beta3
+    controllerManager:
+      extraArgs:
+        leader-elect-lease-duration: "${LEADER_ELECTION_LEASE_DURATION}"
+        leader-elect-renew-deadline: "${LEADER_ELECTION_RENEW_DEADLINE}"
+        leader-elect-retry-period: "${LEADER_ELECTION_RETRY_PERIOD}"
+    scheduler:
+      extraArgs:
+        leader-elect-lease-duration: "${LEADER_ELECTION_LEASE_DURATION}"
+        leader-elect-renew-deadline: "${LEADER_ELECTION_RENEW_DEADLINE}"
+        leader-elect-retry-period: "${LEADER_ELECTION_RETRY_PERIOD}"
+  - |
+    kind: ClusterConfiguration
+    apiVersion: kubeadm.k8s.io/v1beta4
+    controllerManager:
+      extraArgs:
+      - name: leader-elect-lease-duration
+        value: "${LEADER_ELECTION_LEASE_DURATION}"
+      - name: leader-elect-renew-deadline
+        value: "${LEADER_ELECTION_RENEW_DEADLINE}"
+      - name: leader-elect-retry-period
+        value: "${LEADER_ELECTION_RETRY_PERIOD}"
+    scheduler:
+      extraArgs:
+      - name: leader-elect-lease-duration
+        value: "${LEADER_ELECTION_LEASE_DURATION}"
+      - name: leader-elect-renew-deadline
+        value: "${LEADER_ELECTION_RENEW_DEADLINE}"
+      - name: leader-elect-retry-period
+        value: "${LEADER_ELECTION_RETRY_PERIOD}"
+EOF
+  fi
+  cat >> "${config_template}" <<'EOF'
+{{- range $.workers }}
+- role: worker
+  {{- if hasKey $ "image" }}
+  image: {{ $.image }}
+  {{- end }}
+
+  {{- if hasKey . "devices" }}
+  {{- $devices := .devices }}
+  {{- if not (kindIs "slice" $devices) }}
+    {{- $devices = list .devices }}
+  {{- end }}
+  extraMounts:
+    # We inject all NVIDIA GPUs using the nvidia-container-runtime.
+    # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set
+    # in `/etc/nvidia-container-runtime/config.toml`
+    {{- range $d := $devices }}
+    - hostPath: /dev/null
+      containerPath: /var/run/nvidia-container-devices/{{ $d }}
+    {{- end }}
+  {{- end }}
+{{- end }}
+EOF
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+    echo "Applying control-plane static pod resource patches from ${patch_dir}:"
+    for patch_file in "${patch_dir}"/*.yaml; do
+      echo "--- ${patch_file}"
+      sed 's/^/  /' "${patch_file}"
+    done
+  fi
+  if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+    echo "Increasing kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes:"
+    echo "  lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
+    echo "  renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
+    echo "  retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
+  fi
+  validate_generated_control_plane_config
+  CREATE_ARGS+=(--config-template="${config_template}")
+fi
+
+# Create the cluster under the CLUSTER_CREATE_TIMEOUT wall-clock limit. The "||"
+# captures the exit status without tripping errexit.
+create_status=0
+timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}" || create_status=$?
+
+# Status 124 (reported below as a timeout) only warns, so the post-create
+# checks decide whether the cluster is usable; any other failure is fatal
+# and is propagated as the script's exit status.
+if (( create_status == 124 )); then
+  echo "::warning::nvkind cluster create timed out after ${CLUSTER_CREATE_TIMEOUT}; continuing only if post-create checks pass"
+elif (( create_status != 0 )); then
+  echo "::error::nvkind cluster create failed with status ${create_status}"
+  exit "${create_status}"
+fi
+
+kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s  # abort (set -e) unless every node is Ready within 300s
+kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info
+kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide
+kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes | \
+  grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:|  cpu|  memory|  nvidia.com/gpu)" || true
+# Print the Docker CPU/memory settings of every container that carries this cluster's kind label.
+echo "=== Kind node container resources ==="
+docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+  --format '{{.Names}}' | sort | while read -r node_container; do
+    [[ -z "${node_container}" ]] && continue
+    docker inspect "${node_container}" \
+      --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}'
+  done
+# Print per-container requests and limits of pods labelled tier=control-plane; diagnostic only, failures are ignored (|| true).
+echo "=== Control-plane resource requests/limits ==="
+kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
+  get pods -l tier=control-plane -o json | jq -r '
+    .items[] as $pod |
+    $pod.metadata.name,
+    ($pod.spec.containers[] |
+      "  " + .name +
+      " requests=" + ((.resources.requests // {}) | tostring) +
+      " limits=" + ((.resources.limits // {}) | tostring))
+  ' || true
+
+# Prints $1 with a whole-core "<N>000m" value rewritten as "<N>"; any other value is printed unchanged.
+normalize_cpu_request() {
+  local quantity="$1" whole_cores_re='^([0-9]+)000m$'
+
+  if [[ "${quantity}" =~ ${whole_cores_re} ]]; then
+    quantity="${BASH_REMATCH[1]}"
+  fi
+  echo "${quantity}"
+}
+
+# Prints resources.requests.<$2> of the first container of the first kube-system pod labelled component=<$1>.
+control_plane_request() {
+  local selector="component=$1"
+  local path="{.items[0].spec.containers[0].resources.requests.$2}"
+
+  kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
+    get pod -l "${selector}" -o "jsonpath=${path}"
+}
+
+# Compares the live request of component $1 / resource $2 with $3 and exits 1 on mismatch; cpu values are normalized on both sides first.
+assert_control_plane_request() {
+  local component="$1" resource="$2" expected="$3"
+  local actual
+
+  actual="$(control_plane_request "${component}" "${resource}")"
+  if [[ "${resource}" == "cpu" ]]; then
+    expected="$(normalize_cpu_request "${expected}")"
+    actual="$(normalize_cpu_request "${actual}")"
+  fi
+  if [[ "${actual}" == "${expected}" ]]; then
+    echo "${component} ${resource} request verified: ${actual}"
+    return 0
+  fi
+  echo "::error::${component} ${resource} request is '${actual}', expected '${expected}'"
+  exit 1
+}
+
+# Prints, one per line, the command followed by the args of the first container of the first kube-system pod labelled component=<$1>.
+control_plane_command_args() {
+  local selector="component=$1"
+  local jq_filter='.items[0].spec.containers[0] | ((.command // []) + (.args // []))[]?'
+  kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system get pod -l "${selector}" -o json |
+    jq -r "${jq_filter}"
+}
+
+# Succeeds when the static pod manifest of component $1 inside the control-plane node container contains the literal "- <$2>".
+static_pod_manifest_contains_arg() {
+  local manifest="/etc/kubernetes/manifests/$1.yaml"
+  local needle="- $2"
+
+  docker exec "${KIND_CLUSTER_NAME}-control-plane" grep -Fq -- "${needle}" "${manifest}"
+}
+
+# Prints lines 1-220 of the static pod manifest of component $1 from the control-plane node container; read failures are ignored.
+dump_static_pod_manifest() {
+  local manifest="/etc/kubernetes/manifests/$1.yaml"
+
+  echo "Static pod manifest ${manifest}:"
+  docker exec "${KIND_CLUSTER_NAME}-control-plane" sed -n '1,220p' "${manifest}" || true
+}
+
+# Verifies that component $1 runs with flag $2. The live pod command/args are checked first; if the flag is absent there
+# but present in the static pod manifest, a warning is emitted and the check passes; otherwise diagnostics are printed and the script exits 1.
+assert_control_plane_arg() {
+  local component="$1" expected="$2" command_args
+  command_args="$(control_plane_command_args "${component}")"
+  # "--" ends option parsing: ${expected} begins with "--", which grep would otherwise reject as an unknown option (exit 2).
+  if ! grep -Fxq -- "${expected}" <<< "${command_args}"; then
+    if static_pod_manifest_contains_arg "${component}" "${expected}"; then
+      echo "::warning::${component} live mirror pod command/args did not show ${expected}; static pod manifest is patched"
+      return
+    fi
+    echo "::error::${component} live pod command/args does not contain ${expected}"
+    echo "Observed live command/args:"
+    echo "${command_args}"
+    dump_static_pod_manifest "${component}"
+    exit 1
+  fi
+  echo "${component} command/args verified: ${expected}"
+}
+
+if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then  # compare each configured request with what the live pod reports
+  echo "Verifying control-plane resource patches..."
+  assert_control_plane_request kube-apiserver cpu "${API_SERVER_CPU_REQUEST}"
+  assert_control_plane_request kube-apiserver memory "${API_SERVER_MEMORY_REQUEST}"
+  assert_control_plane_request kube-controller-manager cpu "${CONTROLLER_MANAGER_CPU_REQUEST}"
+  assert_control_plane_request kube-controller-manager memory "${CONTROLLER_MANAGER_MEMORY_REQUEST}"
+  assert_control_plane_request kube-scheduler cpu "${SCHEDULER_CPU_REQUEST}"
+  assert_control_plane_request kube-scheduler memory "${SCHEDULER_MEMORY_REQUEST}"
+  assert_control_plane_request etcd cpu "${ETCD_CPU_REQUEST}"
+  assert_control_plane_request etcd memory "${ETCD_MEMORY_REQUEST}"
+fi
+# Confirm kube-controller-manager and kube-scheduler each carry the three tuned leader election flags.
+if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+  echo "Verifying control-plane leader election timeout patches..."
+  for component in kube-controller-manager kube-scheduler; do
+    assert_control_plane_arg "${component}" "--leader-elect-lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
+    assert_control_plane_arg "${component}" "--leader-elect-renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
+    assert_control_plane_arg "${component}" "--leader-elect-retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
+  done
+fi
diff --git a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
new file mode 100644
index 000000000..8e85ffcb9
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Delete a leftover kind cluster named KIND_CLUSTER_NAME (bounded to 180s), force-remove containers still labelled for it, and exit 1 if any survive.
+kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+if ! kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then
+  echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
+else
+  echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
+  timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}" ||
+    echo "::warning::kind delete cluster timed out or failed; falling back to direct container cleanup"
+fi
+
+remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")
+if [[ -n "${remaining_containers}" ]]; then
+  echo "Removing stale containers for ${KIND_CLUSTER_NAME}:"
+  docker ps -a --filter "label=${kind_cluster_label}"
+  docker rm -f ${remaining_containers}  # unquoted so each container ID becomes its own argument
+fi
+
+remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")
+if [[ -n "${remaining_containers}" ]]; then
+  echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:"
+  docker ps -a --filter "label=${kind_cluster_label}"
+  exit 1
+fi
diff --git a/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh
new file mode 100644
index 000000000..843496a38
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Set the host's inotify limits via sysctl: max_user_watches to 524288 and max_user_instances to 1024.
+sudo sysctl -w fs.inotify.max_user_watches=524288
+sudo sysctl -w fs.inotify.max_user_instances=1024
diff --git a/.github/actions/gpu-cluster-setup/install-nvkind.sh b/.github/actions/gpu-cluster-setup/install-nvkind.sh
new file mode 100644
index 000000000..38f1ce0ae
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/install-nvkind.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Install nvkind at NVKIND_VERSION with `go install`, then run `nvkind --help`; under set -e the script fails if that invocation fails.
+go install "github.com/NVIDIA/nvkind/cmd/nvkind@${NVKIND_VERSION}"
+nvkind --help
diff --git a/.github/actions/gpu-cluster-setup/runner-preflight.sh b/.github/actions/gpu-cluster-setup/runner-preflight.sh
new file mode 100644
index 000000000..678b9d419
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/runner-preflight.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Log a host baseline, validate the numeric thresholds, then confirm Docker and nvidia-smi respond; any failing command aborts (set -e).
+echo "=== Runner baseline ==="
+date -u
+hostname
+uptime
+nproc
+free -h
+df -h /
+df -ih /
+# ":-" keeps an unset threshold from tripping `set -u`, so it is reported through the ::error:: annotation below instead.
+for value_name in MIN_GPU_COUNT MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do
+  value="${!value_name:-}"
+  if ! [[ "${value}" =~ ^[0-9]+$ ]]; then
+    echo "::error::${value_name} must be an integer, got '${value}'"
+    exit 1
+  fi
+done
+
+echo "=== Docker health ==="
+docker info >/dev/null
+docker version
+
+echo "=== Host GPUs ==="
+nvidia-smi -L
+nvidia-smi
+
+# Count visible GPUs (only names matching the case-insensitive extended regex GPU_MODEL_PATTERN when it is set) and require at least MIN_GPU_COUNT.
+mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader)
+if [[ -z "${GPU_MODEL_PATTERN}" ]]; then
+  gpu_count="${#gpu_names[@]}"
+  echo "Visible GPUs: ${gpu_count}"
+else
+  grep_status=0
+  gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic -- "${GPU_MODEL_PATTERN}") || grep_status=$?
+  case "${grep_status}" in
+    0) ;;
+    2)
+      echo "::error::invalid gpu_model_pattern regex: ${GPU_MODEL_PATTERN}"
+      exit 1
+      ;;
+    *) gpu_count=0 ;;
+  esac
+  echo "Visible GPUs matching '${GPU_MODEL_PATTERN}': ${gpu_count}"
+fi
+
+if (( gpu_count < MIN_GPU_COUNT )); then
+  echo "::error::visible GPU count ${gpu_count} is below required minimum ${MIN_GPU_COUNT}"
+  exit 1
+fi
+
+echo "=== Existing kind state ==="
+kind get clusters || true
+docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
diff --git a/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
new file mode 100644
index 000000000..6f01ba156
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail  # the run below is capped at 300s; it mounts /dev/null at /var/run/nvidia-container-devices/all and lists GPUs from inside the container
+timeout 300s docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L
diff --git a/.github/actions/gpu-cluster-setup/validate-env.sh b/.github/actions/gpu-cluster-setup/validate-env.sh
new file mode 100644
index 000000000..697d077c2
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/validate-env.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Abort with an ::error:: annotation when KIND_CLUSTER_NAME is unset or empty.
+[[ -n "${KIND_CLUSTER_NAME:-}" ]] || {
+  echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow"
+  exit 1
+}
diff --git a/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
new file mode 100644
index 000000000..3d58a4887
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail  # pull KIND_NODE_IMAGE when it is not cached locally, then re-check free disk on / against MIN_FREE_DISK_GB
+echo "=== Kind node image cache ==="
+if ! docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then
+  echo "Pulling kind node image: ${KIND_NODE_IMAGE}"
+  timeout 600s docker pull "${KIND_NODE_IMAGE}"
+else
+  echo "Kind node image already cached: ${KIND_NODE_IMAGE}"
+fi
+free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
+if (( free_disk_gb < MIN_FREE_DISK_GB )); then
+  echo "::error::free disk on / is ${free_disk_gb}GiB after warming ${KIND_NODE_IMAGE}, need at least ${MIN_FREE_DISK_GB}GiB"
+  exit 1
+fi
+echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gb}GiB"
diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml
index 86d247932..b30c63f2d 100644
--- a/.github/actions/gpu-operator-install/action.yml
+++ b/.github/actions/gpu-operator-install/action.yml
@@ -49,105 +49,33 @@ runs:
     - name: Install GPU Operator (helm)
       if: inputs.method == 'helm'
       shell: bash
-      run: |
-        helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
-        helm repo update
-        helm upgrade -i \
-          --kube-context="kind-${KIND_CLUSTER_NAME}" \
-          --namespace gpu-operator \
-          --create-namespace \
-          --set driver.enabled=false \
-          --set toolkit.enabled=false \
-          --set dcgmExporter.enabled=false \
-          --set nfd.enabled=true \
-          --wait --timeout=600s \
-          gpu-operator nvidia/gpu-operator
-
+      run: bash "${{ github.action_path }}/install-gpu-operator-helm.sh"
     - name: Wait for GPU operands (helm)
       if: inputs.method == 'helm'
       shell: bash
-      run: |
-        echo "Waiting for device plugin to be ready..."
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-          rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true
-        echo "GPU Operator pods:"
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
-
+      run: bash "${{ github.action_path }}/wait-gpu-operands-helm.sh"
     # --- Bundle mode: aicr recipe → bundle → deploy ---
 
     - name: Generate recipe
       if: inputs.method == 'bundle'
       shell: bash
-      run: |
-        PLATFORM_FLAG=""
-        if [[ -n "${{ inputs.platform }}" ]]; then
-          PLATFORM_FLAG="--platform ${{ inputs.platform }}"
-        fi
-        ./aicr recipe \
-          --service kind \
-          --accelerator ${{ inputs.accelerator }} \
-          --os ubuntu \
-          --intent ${{ inputs.intent }} \
-          ${PLATFORM_FLAG} \
-          --output recipe.yaml
-        echo "Recipe written to recipe.yaml"
-
+      env:
+        AICR_ACCELERATOR: ${{ inputs.accelerator }}
+        AICR_INTENT: ${{ inputs.intent }}
+        AICR_PLATFORM: ${{ inputs.platform }}
+      run: bash "${{ github.action_path }}/generate-recipe.sh"
     - name: Generate deployment bundle
       if: inputs.method == 'bundle'
       shell: bash
-      run: |
-        ./aicr bundle \
-          --recipe recipe.yaml \
-          --accelerated-node-toleration nvidia.com/gpu:NoSchedule \
-          --output bundle
-        echo "--- Bundle contents ---"
-        ls -la bundle/
-
+      run: bash "${{ github.action_path }}/generate-bundle.sh"
     - name: Install bundle into cluster
       if: inputs.method == 'bundle'
       shell: bash
-      run: |
-        cd bundle
-        # The default keeps legacy bundle-mode behavior: do not wait on every
-        # Helm resource and keep deploying after component failures. H100
-        # qualification jobs override these inputs to hard-fail and wait.
-        chmod +x deploy.sh
-        DEPLOY_ARGS=()
-        if [[ "${{ inputs.wait }}" != "true" ]]; then
-          DEPLOY_ARGS+=(--no-wait)
-        fi
-        if [[ "${{ inputs.best_effort }}" == "true" ]]; then
-          DEPLOY_ARGS+=(--best-effort)
-        fi
-        if [[ "${#DEPLOY_ARGS[@]}" -gt 0 ]]; then
-          echo "Deploying bundle with args: ${DEPLOY_ARGS[*]}"
-        else
-          echo "Deploying bundle with default args"
-        fi
-        ./deploy.sh "${DEPLOY_ARGS[@]}"
-
+      env:
+        AICR_DEPLOY_WAIT: ${{ inputs.wait }}
+        AICR_DEPLOY_BEST_EFFORT: ${{ inputs.best_effort }}
+      run: bash "${{ github.action_path }}/install-bundle.sh"
     - name: Wait for GPU operands (bundle)
       if: inputs.method == 'bundle'
       shell: bash
-      run: |
-        echo "Waiting for GPU operator controller to deploy operands..."
-        # The GPU operator controller watches ClusterPolicy and creates
-        # DaemonSets for device-plugin, NFD, GFD, etc. This happens
-        # asynchronously after the helm install completes.
-        for i in $(seq 1 30); do
-          count=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-            get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l)
-          if [[ "$count" -gt 0 ]]; then
-            echo "Device plugin DaemonSet found."
-            break
-          fi
-          echo "Waiting for device plugin DaemonSet to be created... (${i}/30)"
-          sleep 10
-        done
-        echo "Waiting for device plugin rollout..."
-        # Operands are excluded from control-plane nodes via nodeAffinity in
-        # the kind overlay, so all scheduled pods should become ready.
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-          rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
-        echo "GPU Operator pods:"
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
+      run: bash "${{ github.action_path }}/wait-gpu-operands-bundle.sh"
diff --git a/.github/actions/gpu-operator-install/generate-bundle.sh b/.github/actions/gpu-operator-install/generate-bundle.sh
new file mode 100644
index 000000000..c7dd3f413
--- /dev/null
+++ b/.github/actions/gpu-operator-install/generate-bundle.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Render the deployment bundle from recipe.yaml into ./bundle, passing nvidia.com/gpu:NoSchedule as the accelerated-node toleration, then list the output.
+./aicr bundle \
+  --recipe recipe.yaml \
+  --accelerated-node-toleration nvidia.com/gpu:NoSchedule \
+  --output bundle
+echo "--- Bundle contents ---"
+ls -la bundle/
diff --git a/.github/actions/gpu-operator-install/generate-recipe.sh b/.github/actions/gpu-operator-install/generate-recipe.sh
new file mode 100644
index 000000000..6015e69ed
--- /dev/null
+++ b/.github/actions/gpu-operator-install/generate-recipe.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Assemble the `aicr recipe` flags from the AICR_* environment variables and write recipe.yaml.
+# --platform is passed only when AICR_PLATFORM is non-empty.
+RECIPE_ARGS=(--service kind)
+RECIPE_ARGS+=(--accelerator "${AICR_ACCELERATOR}")
+RECIPE_ARGS+=(--os ubuntu)
+RECIPE_ARGS+=(--intent "${AICR_INTENT}")
+[[ -z "${AICR_PLATFORM}" ]] || RECIPE_ARGS+=(--platform "${AICR_PLATFORM}")
+RECIPE_ARGS+=(--output recipe.yaml)
+
+# The array expands to one argument per element, in the order appended above.
+./aicr recipe "${RECIPE_ARGS[@]}"
+echo "Recipe written to recipe.yaml"
diff --git a/.github/actions/gpu-operator-install/install-bundle.sh b/.github/actions/gpu-operator-install/install-bundle.sh
new file mode 100644
index 000000000..cefa4ce5d
--- /dev/null
+++ b/.github/actions/gpu-operator-install/install-bundle.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Run the bundle's deploy.sh from inside the bundle directory.
+cd bundle
+chmod +x deploy.sh
+# The default keeps legacy bundle-mode behavior: do not wait on every
+# Helm resource and keep deploying after component failures. H100
+# qualification jobs override these inputs to hard-fail and wait.
+DEPLOY_ARGS=()
+if [[ "${AICR_DEPLOY_WAIT}" != "true" ]]; then DEPLOY_ARGS+=(--no-wait); fi
+if [[ "${AICR_DEPLOY_BEST_EFFORT}" == "true" ]]; then DEPLOY_ARGS+=(--best-effort); fi
+if [[ "${#DEPLOY_ARGS[@]}" -gt 0 ]]; then
+  echo "Deploying bundle with args: ${DEPLOY_ARGS[*]}"
+else
+  echo "Deploying bundle with default args"
+fi
+# The ${arr[@]+...} form expands to nothing when the array is empty (wait
+# enabled, best-effort disabled); a bare "${DEPLOY_ARGS[@]}" is reported as
+# an unbound variable under `set -u` by bash releases older than 4.4.
+./deploy.sh ${DEPLOY_ARGS[@]+"${DEPLOY_ARGS[@]}"}
diff --git a/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh
new file mode 100644
index 000000000..0aea450eb
--- /dev/null
+++ b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Install (or upgrade) the nvidia/gpu-operator chart into the Kind cluster with
+# the driver, toolkit and DCGM exporter disabled and NFD enabled.
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+helm repo update
+HELM_FLAGS=(
+  --kube-context="kind-${KIND_CLUSTER_NAME}"
+  --namespace gpu-operator --create-namespace
+  --set driver.enabled=false --set toolkit.enabled=false
+  --set dcgmExporter.enabled=false --set nfd.enabled=true
+  --wait --timeout=600s
+)
+helm upgrade -i "${HELM_FLAGS[@]}" gpu-operator nvidia/gpu-operator
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
new file mode 100644
index 000000000..496eb372e
--- /dev/null
+++ b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+echo "Waiting for GPU operator controller to deploy operands..."
+# The GPU operator controller watches ClusterPolicy and creates
+# DaemonSets for device-plugin, NFD, GFD, etc. asynchronously after the
+# helm install completes, so poll up to 30 times, 10s apart. `|| true`
+# turns a failed kubectl call into "0 found" instead of aborting the poll.
+for i in $(seq 1 30); do
+  count=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+    get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l || true)
+  if [[ "$count" -gt 0 ]]; then echo "Device plugin DaemonSet found."; break; fi
+  echo "Waiting for device plugin DaemonSet to be created... (${i}/30)"
+  sleep 10
+done
+# Fail here with a clear message instead of checking a rollout of nothing.
+[[ "$count" -gt 0 ]] || { echo "::error::Device plugin DaemonSet not found after 30 attempts"; exit 1; }
+echo "Waiting for device plugin rollout..."
+# Operands are excluded from control-plane nodes via nodeAffinity in
+# the kind overlay, so all scheduled pods should become ready.
+kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+  rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
+echo "GPU Operator pods:"
+kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
new file mode 100644
index 000000000..2ad7e801d
--- /dev/null
+++ b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Best-effort wait: `|| true` ignores a failed or timed-out rollout check.
+echo "Waiting for device plugin to be ready..."
+gpu_operator_kubectl=(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator)
+"${gpu_operator_kubectl[@]}" rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true
+echo "GPU Operator pods:"
+"${gpu_operator_kubectl[@]}" get pods
diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml
index b89224a60..7af987da0 100644
--- a/.github/actions/gpu-snapshot-validate/action.yml
+++ b/.github/actions/gpu-snapshot-validate/action.yml
@@ -36,54 +36,18 @@ runs:
   steps:
     - name: Run aicr snapshot
       shell: bash
-      run: |
-        ./aicr snapshot \
-          --kubeconfig="${HOME}/.kube/config" \
-          --namespace=default \
-          --image=ko.local:smoke-test \
-          --require-gpu \
-          --timeout="${{ inputs.snapshot_timeout }}" \
-          --output=snapshot.yaml
-        echo "--- Snapshot output ---"
-        cat snapshot.yaml
-
+      env:
+        SNAPSHOT_TIMEOUT: ${{ inputs.snapshot_timeout }}
+      run: bash "${{ github.action_path }}/run-snapshot.sh"
     - name: Validate snapshot detected GPU
       shell: bash
-      run: |
-        # Query by subtype field (not index) — #502 added a "hardware" subtype before "smi".
-        GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml)
-        GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml)
-        echo "GPU model: ${GPU_MODEL}"
-        echo "GPU count: ${GPU_COUNT}"
-        if [[ "${GPU_MODEL}" != *"${{ inputs.gpu_model }}"* ]]; then
-          echo "::error::Expected ${{ inputs.gpu_model }} GPU in snapshot, got: ${GPU_MODEL}"
-          exit 1
-        fi
-        if [[ "${GPU_COUNT}" -lt ${{ inputs.min_gpu_count }} ]]; then
-          echo "::error::Expected gpu-count >= ${{ inputs.min_gpu_count }}, got: ${GPU_COUNT}"
-          exit 1
-        fi
-        echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
-
+      env:
+        EXPECTED_GPU_MODEL: ${{ inputs.gpu_model }}
+        MIN_GPU_COUNT: ${{ inputs.min_gpu_count }}
+      run: bash "${{ github.action_path }}/validate-snapshot-gpu.sh"
     - name: Debug snapshot Job
       if: failure()
       shell: bash
-      run: |
-        kubectl_kind() {
-          timeout 30s kubectl --request-timeout=10s --context="kind-${{ inputs.cluster_name }}" "$@"
-        }
-
-        echo "=== Snapshot Job ==="
-        kubectl_kind -n default get job aicr -o yaml || true
-        echo "=== Snapshot Pods ==="
-        kubectl_kind -n default get pods -l app.kubernetes.io/name=aicr -o wide || true
-        echo "=== Snapshot Job describe ==="
-        kubectl_kind -n default describe job aicr || true
-        echo "=== Snapshot Pod describe ==="
-        kubectl_kind -n default describe pods -l app.kubernetes.io/name=aicr || true
-        echo "=== Snapshot current logs ==="
-        kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true
-        echo "=== Snapshot previous logs ==="
-        kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true
-        echo "=== Snapshot ConfigMap ==="
-        kubectl_kind -n default get configmap aicr-snapshot -o yaml || true
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.action_path }}/debug-snapshot-job.sh"
diff --git a/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh
new file mode 100644
index 000000000..2e0f1547f
--- /dev/null
+++ b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# kubectl against the Kind cluster: 10s request timeout, 30s overall limit.
+kubectl_kind() {
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+# dump TITLE ARGS...: print a banner, run kubectl in "default", ignore failure.
+dump() {
+  echo "=== $1 ==="
+  kubectl_kind -n default "${@:2}" || true
+}
+
+dump "Snapshot Job" get job aicr -o yaml
+dump "Snapshot Pods" get pods -l app.kubernetes.io/name=aicr -o wide
+dump "Snapshot Job describe" describe job aicr
+dump "Snapshot Pod describe" describe pods -l app.kubernetes.io/name=aicr
+dump "Snapshot current logs" logs -l app.kubernetes.io/name=aicr --all-containers --tail=200
+dump "Snapshot previous logs" logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200
+dump "Snapshot ConfigMap" get configmap aicr-snapshot -o yaml
diff --git a/.github/actions/gpu-snapshot-validate/run-snapshot.sh b/.github/actions/gpu-snapshot-validate/run-snapshot.sh
new file mode 100644
index 000000000..e45b575ef
--- /dev/null
+++ b/.github/actions/gpu-snapshot-validate/run-snapshot.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Run `aicr snapshot` with --require-gpu, then print the resulting snapshot.yaml.
+SNAPSHOT_FLAGS=(
+  --kubeconfig="${HOME}/.kube/config" --namespace=default
+  --image=ko.local:smoke-test --require-gpu
+  --timeout="${SNAPSHOT_TIMEOUT}" --output=snapshot.yaml
+)
+./aicr snapshot "${SNAPSHOT_FLAGS[@]}"
+echo "--- Snapshot output ---"
+cat snapshot.yaml
diff --git a/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh
new file mode 100644
index 000000000..57a622a2d
--- /dev/null
+++ b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Print a GitHub Actions `::error::` message and exit non-zero.
+fail() { echo "::error::$*"; exit 1; }
+
+# Query by subtype field (not index) — #502 added a "hardware" subtype before "smi".
+GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml)
+GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml)
+echo "GPU model: ${GPU_MODEL}"
+echo "GPU count: ${GPU_COUNT}"
+[[ "${GPU_MODEL}" == *"${EXPECTED_GPU_MODEL}"* ]] || fail "Expected ${EXPECTED_GPU_MODEL} GPU in snapshot, got: ${GPU_MODEL}"
+# Reject a malformed count (empty, "null", several lines) explicitly: as a numeric
+# operand it is read as 0 or raises an arithmetic error that the test treats as false.
+[[ "${GPU_COUNT}" =~ ^[0-9]+$ ]] || fail "Expected a numeric gpu-count in snapshot, got: ${GPU_COUNT}"
+[[ "${GPU_COUNT}" -ge "${MIN_GPU_COUNT}" ]] || fail "Expected gpu-count >= ${MIN_GPU_COUNT}, got: ${GPU_COUNT}"
+echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml
index c085ed630..a1eef57b9 100644
--- a/.github/actions/gpu-test-cleanup/action.yml
+++ b/.github/actions/gpu-test-cleanup/action.yml
@@ -32,118 +32,19 @@ runs:
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
-      run: |
-        set -o pipefail
-        mkdir -p /tmp/debug-artifacts
-        CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd"
-        kubectl_kind() {
-          timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
-        }
-
-        kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true
-        kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true
-        kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true
-        kubectl_kind get --raw='/readyz?verbose' > /tmp/debug-artifacts/apiserver-readyz.txt 2>&1 || true
-        kubectl_kind -n kube-system get pods -l tier=control-plane -o wide \
-          > /tmp/debug-artifacts/control-plane-pods.txt 2>&1 || true
-        kubectl_kind -n kube-system get events --sort-by='.lastTimestamp' \
-          > /tmp/debug-artifacts/kube-system-events.txt 2>&1 || true
-        for component in ${CONTROL_PLANE_COMPONENTS}; do
-          kubectl_kind -n kube-system describe pod -l "component=${component}" \
-            > "/tmp/debug-artifacts/${component}-describe.txt" 2>&1 || true
-          kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --tail=300 \
-            > "/tmp/debug-artifacts/${component}-logs.txt" 2>&1 || true
-          kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --previous --tail=300 \
-            > "/tmp/debug-artifacts/${component}-previous-logs.txt" 2>&1 || true
-          kubectl_kind -n kube-system get lease "${component}" -o yaml \
-            > "/tmp/debug-artifacts/${component}-lease.yaml" 2>&1 || true
-        done
-        kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true
-        kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
-        kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
-        kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true
-        tar_inputs=()
-        [[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml)
-        [[ -d bundle ]] && tar_inputs+=(bundle)
-        if [[ "${#tar_inputs[@]}" -gt 0 ]]; then
-          echo "Archiving runtime bundle inputs: ${tar_inputs[*]}"
-          tar -czf /tmp/debug-artifacts/aicr-runtime-bundle.tar.gz "${tar_inputs[@]}" || true
-        else
-          echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive"
-        fi
-
-        docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
-          --format '{{.Names}}' | sort | while read -r node_container; do
-            [[ -z "${node_container}" ]] && continue
-            node_file="${node_container//[^A-Za-z0-9_.-]/_}"
-            timeout 30s docker exec "${node_container}" journalctl -u kubelet \
-              --since "90 minutes ago" --no-pager \
-              > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true
-            timeout 30s docker exec "${node_container}" journalctl -u containerd \
-              --since "90 minutes ago" --no-pager \
-              > "/tmp/debug-artifacts/${node_file}-containerd-journal.txt" 2>&1 || true
-            timeout 30s docker exec "${node_container}" crictl ps -a \
-              > "/tmp/debug-artifacts/${node_file}-crictl-ps-a.txt" 2>&1 || true
-            timeout 30s docker exec "${node_container}" crictl pods \
-              > "/tmp/debug-artifacts/${node_file}-crictl-pods.txt" 2>&1 || true
-            timeout 30s docker exec "${node_container}" crictl stats \
-              > "/tmp/debug-artifacts/${node_file}-crictl-stats.txt" 2>&1 || true
-            timeout 30s docker exec "${node_container}" sh -c '
-              date
-              uptime || true
-              free -h || true
-              df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
-              echo "--- top cpu/memory processes ---"
-              ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
-            ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true
-            timeout 120s docker exec "${node_container}" sh -c '
-              for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do
-                echo "=== ${component} static pod manifest ==="
-                sed -n "1,220p" "/etc/kubernetes/manifests/${component}.yaml" 2>/dev/null || true
-                echo "=== ${component} CRI containers ==="
-                crictl ps -a --name "${component}" || true
-                count=0
-                for container_id in $(crictl ps -a --name "${component}" -q 2>/dev/null); do
-                  count=$((count + 1))
-                  if [ "${count}" -gt 8 ]; then
-                    echo "Skipping remaining ${component} CRI containers after first 8 entries."
-                    break
-                  fi
-                  echo "=== crictl inspect ${component} ${container_id} ==="
-                  crictl inspect "${container_id}" || true
-                  echo "=== crictl logs ${component} ${container_id} ==="
-                  crictl logs --tail=300 "${container_id}" || true
-                done
-              done
-            ' > "/tmp/debug-artifacts/${node_file}-control-plane-cri.txt" 2>&1 || true
-          done
-
+      run: bash "${{ github.action_path }}/collect-debug-artifacts.sh"
     - name: Export kind logs
       if: failure() || cancelled()
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
-      run: |
-        mkdir -p /tmp/kind-logs
-        timeout 300s kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true
-
+      run: bash "${{ github.action_path }}/export-kind-logs.sh"
     - name: Cleanup
       if: always()
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
-      run: |
-        timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true
-        kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
-        remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")
-        if [[ -n "${remaining_containers}" ]]; then
-          echo "Removing leftover kind containers for ${KIND_CLUSTER_NAME}:"
-          docker ps -a --filter "label=${kind_cluster_label}"
-          docker rm -f ${remaining_containers} || true
-        fi
-        timeout 60s docker builder prune -f --filter "until=24h" || true
-        timeout 60s docker system prune -f --filter "until=24h" || true
-
+      run: bash "${{ github.action_path }}/cleanup-kind-cluster.sh"
     - name: Upload debug artifacts
       if: failure() || cancelled()
       uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f  # v6.0.0
diff --git a/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh
new file mode 100644
index 000000000..4603d494d
--- /dev/null
+++ b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# docker_timeout LIMIT ARGS...: run `docker ARGS...` under `timeout LIMIT`.
+docker_timeout() { timeout "$1" docker "${@:2}"; }
+
+# Each step is best-effort (`|| true`), so a failure never aborts the cleanup.
+timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true
+label_filter="label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+leftover_ids=$(docker_timeout 30s ps -aq --filter "${label_filter}" || true)
+if [[ -n "${leftover_ids}" ]]; then
+  echo "Removing leftover kind containers for ${KIND_CLUSTER_NAME}:"
+  docker_timeout 30s ps -a --filter "${label_filter}" || true
+  # Deliberately unquoted so each container ID becomes its own argument.
+  docker_timeout 30s rm -f ${leftover_ids} || true
+fi
+docker_timeout 60s builder prune -f --filter "until=24h" || true
+docker_timeout 60s system prune -f --filter "until=24h" || true
diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
new file mode 100644
index 000000000..417606c7d
--- /dev/null
+++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -o pipefail
+# Best-effort capture of cluster and node diagnostics into /tmp/debug-artifacts.
+mkdir -p /tmp/debug-artifacts
+CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd"
+# kubectl against the Kind cluster: 10s request timeout, 30s overall limit.
+kubectl_kind() {
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+# docker_timeout LIMIT ARGS...: run `docker ARGS...` under `timeout LIMIT`.
+docker_timeout() { timeout "$1" docker "${@:2}"; }
+
+# Cluster-wide resources and events, API server health endpoints, control-plane pods.
+kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true
+kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true
+for probe in livez readyz; do
+  kubectl_kind get --raw="/${probe}?verbose" > "/tmp/debug-artifacts/apiserver-${probe}.txt" 2>&1 || true
+done
+kubectl_kind -n kube-system get pods -l tier=control-plane -o wide \
+  > /tmp/debug-artifacts/control-plane-pods.txt 2>&1 || true
+kubectl_kind -n kube-system get events --sort-by='.lastTimestamp' \
+  > /tmp/debug-artifacts/kube-system-events.txt 2>&1 || true
+# Per control-plane component: pod description, current/previous logs, lease.
+for component in ${CONTROL_PLANE_COMPONENTS}; do
+  selector="component=${component}"
+  artifact="/tmp/debug-artifacts/${component}"
+  kubectl_kind -n kube-system describe pod -l "${selector}" > "${artifact}-describe.txt" 2>&1 || true
+  kubectl_kind -n kube-system logs -l "${selector}" --all-containers --tail=300 \
+    > "${artifact}-logs.txt" 2>&1 || true
+  kubectl_kind -n kube-system logs -l "${selector}" --all-containers --previous --tail=300 \
+    > "${artifact}-previous-logs.txt" 2>&1 || true
+  kubectl_kind -n kube-system get lease "${component}" -o yaml > "${artifact}-lease.yaml" 2>&1 || true
+done
+kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true
+kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
+kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
+kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true
+# Archive whichever of recipe.yaml / bundle/ exist in the working directory.
+bundle_inputs=()
+if [[ -f recipe.yaml ]]; then bundle_inputs+=(recipe.yaml); fi
+if [[ -d bundle ]]; then bundle_inputs+=(bundle); fi
+if [[ "${#bundle_inputs[@]}" -eq 0 ]]; then
+  echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive"
+else
+  echo "Archiving runtime bundle inputs: ${bundle_inputs[*]}"
+  tar -czf /tmp/debug-artifacts/aicr-runtime-bundle.tar.gz "${bundle_inputs[@]}" || true
+fi
+
+# Per Kind node container: journals, CRI listings, host pressure, control-plane CRI logs.
+docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+  --format '{{.Names}}' | sort | while read -r node_container; do
+    [[ -n "${node_container}" ]] || continue
+    node_file="${node_container//[^A-Za-z0-9_.-]/_}"
+    node_out="/tmp/debug-artifacts/${node_file}"
+    for unit in kubelet containerd; do
+      docker_timeout 30s exec "${node_container}" journalctl -u "${unit}" \
+        --since "90 minutes ago" --no-pager > "${node_out}-${unit}-journal.txt" 2>&1 || true
+    done
+    docker_timeout 30s exec "${node_container}" crictl ps -a > "${node_out}-crictl-ps-a.txt" 2>&1 || true
+    docker_timeout 30s exec "${node_container}" crictl pods > "${node_out}-crictl-pods.txt" 2>&1 || true
+    docker_timeout 30s exec "${node_container}" crictl stats > "${node_out}-crictl-stats.txt" 2>&1 || true
+    docker_timeout 30s exec "${node_container}" sh -c '
+      date
+      uptime || true
+      free -h || true
+      df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+      echo "--- top cpu/memory processes ---"
+      ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
+    ' > "${node_out}-node-pressure.txt" 2>&1 || true
+    docker_timeout 120s exec "${node_container}" sh -c '
+      for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do
+        echo "=== ${component} static pod manifest ==="
+        sed -n "1,220p" "/etc/kubernetes/manifests/${component}.yaml" 2>/dev/null || true
+        echo "=== ${component} CRI containers ==="
+        crictl ps -a --name "${component}" || true
+        count=0
+        for container_id in $(crictl ps -a --name "${component}" -q 2>/dev/null); do
+          count=$((count + 1))
+          if [ "${count}" -gt 8 ]; then
+            echo "Skipping remaining ${component} CRI containers after first 8 entries."
+            break
+          fi
+          echo "=== crictl inspect ${component} ${container_id} ==="
+          crictl inspect "${container_id}" || true
+          echo "=== crictl logs ${component} ${container_id} ==="
+          crictl logs --tail=300 "${container_id}" || true
+        done
+      done
+    ' > "${node_out}-control-plane-cri.txt" 2>&1 || true
+  done || true
diff --git a/.github/actions/gpu-test-cleanup/export-kind-logs.sh b/.github/actions/gpu-test-cleanup/export-kind-logs.sh
new file mode 100644
index 000000000..2522481eb
--- /dev/null
+++ b/.github/actions/gpu-test-cleanup/export-kind-logs.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Export Kind cluster logs (bounded to 300s); a failed or timed-out export is ignored.
+kind_log_dir=/tmp/kind-logs && mkdir -p "${kind_log_dir}"
+timeout 300s kind export logs "${kind_log_dir}" --name "${KIND_CLUSTER_NAME}" || true
diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml
index f66848e6f..f26aa38a5 100644
--- a/.github/actions/install-karpenter-kwok/action.yml
+++ b/.github/actions/install-karpenter-kwok/action.yml
@@ -38,11 +38,7 @@ runs:
     - name: Resolve versions
       id: versions
       shell: bash
-      run: |
-        echo "karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" >> "$GITHUB_OUTPUT"
-        echo "ko=$(yq eval '.build_tools.ko' .settings.yaml)" >> "$GITHUB_OUTPUT"
-        echo "go=$(go env GOVERSION)" >> "$GITHUB_OUTPUT"
-
+      run: bash "${{ github.action_path }}/resolve-versions.sh"
     - name: Install ko
       uses: ./.github/actions/setup-build-tools
       with:
@@ -68,30 +64,4 @@ runs:
         KWOK_HELM_TIMEOUT: ${{ inputs.kwok_helm_timeout }}
         KO_BUILD_TIMEOUT: ${{ inputs.ko_build_timeout }}
         KARPENTER_HELM_TIMEOUT: ${{ inputs.karpenter_helm_timeout }}
-      run: |
-        set -euo pipefail
-        validate_duration_input() {
-          local input_name="$1"
-          local input_value="$2"
-
-          if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then
-            echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'"
-            exit 1
-          fi
-        }
-
-        validate_seconds_input() {
-          local input_name="$1"
-          local input_value="$2"
-
-          if ! [[ "${input_value}" =~ ^[0-9]+$ ]]; then
-            echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'"
-            exit 1
-          fi
-        }
-
-        validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT}"
-        validate_seconds_input ko_build_timeout "${KO_BUILD_TIMEOUT}"
-        validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT}"
-        bash kwok/scripts/install-karpenter-kwok.sh
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml
+      run: bash "${{ github.action_path }}/install-karpenter-kwok.sh"
diff --git a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
new file mode 100644
index 000000000..2fdb26312
--- /dev/null
+++ b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Validates the timeout inputs of the install-karpenter-kwok action, runs the
+# repository installer, and applies the Karpenter NodePool manifest.
+#
+# Required environment:
+#   KIND_CLUSTER_NAME       Kind cluster name; kubectl targets "kind-<name>".
+#   KWOK_HELM_TIMEOUT       Duration such as 300s, 10m, or 1h.
+#   KO_BUILD_TIMEOUT        Integer number of seconds.
+#   KARPENTER_HELM_TIMEOUT  Duration such as 300s, 10m, or 1h.
+
+set -euo pipefail
+
+# Fails the step unless the value is digits followed by one of s, m, or h.
+validate_duration_input() {
+  local input_name="$1"
+  local input_value="$2"
+
+  if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then
+    echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'"
+    exit 1
+  fi
+}
+
+# Fails the step unless the value consists only of digits.
+validate_seconds_input() {
+  local input_name="$1"
+  local input_value="$2"
+
+  if ! [[ "${input_value}" =~ ^[0-9]+$ ]]; then
+    echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'"
+    exit 1
+  fi
+}
+
+# This script only dereferences KIND_CLUSTER_NAME on its final line, after the
+# installer has run. Reject a missing or empty value up front so a
+# misconfigured job fails before the install instead of after it.
+if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then
+  echo "::error::KIND_CLUSTER_NAME must be set to the Kind cluster name"
+  exit 1
+fi
+
+# `:-` lets an unset input reach the validators and produce the ::error::
+validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT:-}"
+validate_seconds_input ko_build_timeout "${KO_BUILD_TIMEOUT:-}"
+validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT:-}"
+bash kwok/scripts/install-karpenter-kwok.sh
+kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml
diff --git a/.github/actions/install-karpenter-kwok/resolve-versions.sh b/.github/actions/install-karpenter-kwok/resolve-versions.sh
new file mode 100644
index 000000000..84e85458e
--- /dev/null
+++ b/.github/actions/install-karpenter-kwok/resolve-versions.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Resolves tool versions for the install-karpenter-kwok action and appends
+# them to $GITHUB_OUTPUT as karpenter=, ko=, and go=.
+
+set -euo pipefail
+
+# Prints the value found at a yq path in .settings.yaml. An empty or "null"
+# result (what yq is expected to print for a missing key) is rejected.
+read_setting() {
+  local path="$1"
+  local value
+
+  if ! value="$(yq eval "${path}" .settings.yaml)"; then
+    echo "::error::yq failed to read ${path} from .settings.yaml" >&2
+    return 1
+  fi
+  if [[ -z "${value}" || "${value}" == "null" ]]; then
+    echo "::error::${path} is not set in .settings.yaml" >&2
+    return 1
+  fi
+  printf '%s\n' "${value}"
+}
+
+# Assign before writing: a failing command substitution inside an `echo`
+# argument does not trip `set -e`, which would record an empty version.
+karpenter_version="$(read_setting '.testing_tools.karpenter')"
+ko_version="$(read_setting '.build_tools.ko')"
+go_version="$(go env GOVERSION)"
+
+{
+  echo "karpenter=${karpenter_version}"
+  echo "ko=${ko_version}"
+  echo "go=${go_version}"
+} >> "$GITHUB_OUTPUT"
diff --git a/.github/scripts/gpu-chainsaw-health.sh b/.github/scripts/gpu-chainsaw-health.sh
new file mode 100644
index 000000000..35450eb00
--- /dev/null
+++ b/.github/scripts/gpu-chainsaw-health.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# Runs the chainsaw health suite in the given test directory with extended
+# cleanup and delete timeouts.
+#
+# Usage: gpu-chainsaw-health.sh <chainsaw-test-dir>
+set -euo pipefail
+
+if [[ $# -lt 1 || ! -d "$1" ]]; then
+  echo "::error::expected an existing chainsaw test directory as the first argument, got '${1:-}'"
+  exit 1
+fi
+test_dir="$1"
+
+chainsaw test \
+  --test-dir "${test_dir}" \
+  --config tests/chainsaw/chainsaw-config.yaml \
+  --cleanup-timeout 120s \
+  --delete-timeout 120s
diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh
new file mode 100644
index 000000000..0d1cd76a2
--- /dev/null
+++ b/.github/scripts/gpu-debug-diagnostics.sh
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+# Best-effort failure diagnostics for the GPU test workflows.
+#
+# Environment:
+#   KIND_CLUSTER_NAME         Kind cluster name; kubectl targets "kind-<name>".
+#   GPU_TEST_DIAGNOSTIC_MODE  smoke (default), training, or inference.
+#
+# `set -e` is deliberately not enabled and every probe ends in `|| true`, so a
+# failing kubectl call never hides the output of the probes that follow it.
+set -o pipefail
+
+mode="${GPU_TEST_DIAGNOSTIC_MODE:-smoke}"
+
+# Probes discard stderr, so with no cluster name every section would be
+# silently empty. Report the misconfiguration instead.
+if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then
+  echo "::error::KIND_CLUSTER_NAME must be set to collect GPU diagnostics"
+  exit 1
+fi
+
+# Runs kubectl against the Kind context with a 10s request timeout, wrapped in
+# `timeout 30s` so a hung call cannot stall the step.
+kubectl_kind() {
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+# Prints one TSV row (kind, namespace/name, comma-joined unique images) per
+# Deployment, DaemonSet, and StatefulSet in the namespace given as $1.
+print_workload_images() {
+  local ns="$1"
+  kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \
+    | jq -r '
+      .items[] |
+      [
+        .kind,
+        .metadata.namespace + "/" + .metadata.name,
+        (([.spec.template.spec.containers[]?.image] +
+          [.spec.template.spec.initContainers[]?.image]) | unique | join(","))
+      ] | @tsv
+    ' || true
+}
+
+# Prints the workload image inventory for every namespace passed as an argument.
+print_workload_inventory() {
+  local ns
+  echo "=== Workload image inventory ==="
+  for ns in "$@"; do
+    echo "--- ${ns} ---"
+    print_workload_images "${ns}"
+  done
+}
+
+# Prints Grafana Deployment and pod state from the monitoring namespace.
+print_grafana_diagnostics() {
+  echo "=== Grafana deployment ==="
+  kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true
+  echo "=== Grafana pods ==="
+  kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true
+  echo "=== Grafana deployment describe ==="
+  kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true
+  echo "=== Grafana pod describe ==="
+  kubectl_kind -n monitoring describe pods -l app.kubernetes.io/name=grafana 2>/dev/null || true
+}
+
+# Prints KAI scheduler pods, admission Deployment state, logs, queues, podgroups, and events.
+print_kai_diagnostics() {
+  echo "=== KAI scheduler pods ==="
+  kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true
+  echo "=== KAI admission deployment ==="
+  kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true
+  echo "=== KAI admission deployment describe ==="
+  kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true
+  echo "=== KAI admission pod describe ==="
+  kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \
+    | grep '^pod/admission-' \
+    | while read -r pod; do
+        kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true
+      done || true
+  echo "=== KAI admission logs ==="
+  kubectl_kind -n kai-scheduler logs deployment/admission --all-containers --tail=200 2>/dev/null || true
+  echo "=== KAI scheduler logs ==="
+  kubectl_kind -n kai-scheduler logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
+  echo "=== KAI scheduler queues ==="
+  kubectl_kind get queues -A 2>/dev/null || true
+  echo "=== KAI scheduler podgroups ==="
+  kubectl_kind get podgroups -A 2>/dev/null || true
+  echo "=== Recent events (kai-scheduler) ==="
+  kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
+}
+
+# Probes shared by the smoke and inference modes: ClusterPolicy, GPU Operator
+# pods, non-running pods in all namespaces, and recent gpu-operator events.
+print_common_gpu_diagnostics() {
+  echo "=== ClusterPolicy status ==="
+  kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true
+  echo "=== GPU Operator pods ==="
+  kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true
+  echo "=== Non-running pods (all namespaces) ==="
+  kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
+  echo "=== Recent events (gpu-operator) ==="
+  kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+}
+
+case "${mode}" in
+  smoke)
+    print_common_gpu_diagnostics
+    echo "=== Node status ==="
+    kubectl_kind get nodes -o wide 2>/dev/null || true
+    ;;
+  training)
+    print_workload_inventory cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \
+      nvidia-network-operator kai-scheduler kubeflow
+    print_grafana_diagnostics
+    print_kai_diagnostics
+    echo "=== Kubeflow Trainer deployment ==="
+    kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
+    echo "=== Kubeflow pods ==="
+    kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true
+    echo "=== Kubeflow validating webhooks ==="
+    kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
+    echo "=== Kubeflow Trainer CRD ==="
+    kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
+    echo "=== Non-running pods (all namespaces) ==="
+    kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
+    echo "=== GPU Operator pods ==="
+    kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true
+    echo "=== Node resources ==="
+    kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true
+    ;;
+  inference)
+    print_workload_inventory cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \
+      nvidia-network-operator kai-scheduler dynamo-system kgateway-system
+    print_common_gpu_diagnostics
+    echo "=== Dynamo pods ==="
+    kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true
+    echo "=== Dynamo operator logs ==="
+    kubectl_kind -n dynamo-system logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
+    echo "=== Recent events (dynamo-system) ==="
+    kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+    print_kai_diagnostics
+    echo "=== Custom metrics API ==="
+    for metric in gpu_utilization gpu_memory_used gpu_power_usage; do
+      echo "--- ${metric} ---"
+      for ns in gpu-operator dynamo-system; do
+        kubectl_kind get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${ns}/pods/*/${metric}" 2>/dev/null | jq . || true
+      done
+    done
+    print_grafana_diagnostics
+    echo "=== prometheus-adapter pods ==="
+    kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
+    echo "=== kgateway pods ==="
+    kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true
+    echo "=== GatewayClass status ==="
+    kubectl_kind get gatewayclass -o yaml 2>/dev/null || true
+    echo "=== Gateway status ==="
+    kubectl_kind get gateways -A -o yaml 2>/dev/null || true
+    echo "=== DCGM Exporter pods ==="
+    kubectl_kind -n gpu-operator get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
+    echo "=== Monitoring pods ==="
+    kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true
+    echo "=== DRA ResourceSlices ==="
+    kubectl_kind get resourceslices -o wide 2>/dev/null || true
+    echo "=== Node status ==="
+    kubectl_kind get nodes -o wide 2>/dev/null || true
+    ;;
+  *)
+    echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}"
+    exit 1
+    ;;
+esac
diff --git a/.github/scripts/gpu-smoke-run-nvidia-smi.sh b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
new file mode 100644
index 000000000..cefff49fa
--- /dev/null
+++ b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Creates a one-shot pod that requests one GPU and runs nvidia-smi, then waits
+# for the pod to finish. Exits non-zero if the pod fails or does not reach
+# the Succeeded phase within the time budget.
+#
+# Environment:
+#   KIND_CLUSTER_NAME  Kind cluster name; kubectl targets "kind-<name>".
+set -euo pipefail
+
+context="kind-${KIND_CLUSTER_NAME}"
+
+cat <<'EOF' | kubectl --context="${context}" apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-smoke-test
+spec:
+  restartPolicy: Never
+  containers:
+  - name: nvidia-smi
+    image: ubuntu:22.04
+    command: ["nvidia-smi"]
+    resources:
+      limits:
+        nvidia.com/gpu: 1
+EOF
+
+echo "Waiting for gpu-smoke-test pod to complete..."
+# Poll the phase instead of chaining `kubectl wait` calls: a pod that exits
+# quickly may never report Ready, which made the former Ready wait burn its
+# full timeout, and a Failed pod could not end the Succeeded wait early.
+# The 240s budget equals the two former 120s waits combined.
+deadline=$((SECONDS + 240))
+phase=""
+while (( SECONDS < deadline )); do
+  phase="$(kubectl --context="${context}" get pod gpu-smoke-test \
+    -o jsonpath='{.status.phase}' 2>/dev/null || true)"
+  case "${phase}" in
+    Succeeded)
+      exit 0
+      ;;
+    Failed)
+      echo "::error::gpu-smoke-test pod failed"
+      kubectl --context="${context}" logs gpu-smoke-test || true
+      exit 1
+      ;;
+  esac
+  sleep 2
+done
+
+echo "::error::gpu-smoke-test pod did not complete within 240s (last phase: ${phase:-unknown})"
+exit 1
diff --git a/.github/scripts/gpu-validate-conformance.sh b/.github/scripts/gpu-validate-conformance.sh
new file mode 100644
index 000000000..2fb3fa4a3
--- /dev/null
+++ b/.github/scripts/gpu-validate-conformance.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Runs the conformance phase of `aicr validate` against recipe.yaml, passing
+# validation-result.yaml as --output and conformance-evidence as --evidence-dir.
+set -euo pipefail
+
+# Arguments are collected in an array so each flag sits on its own line.
+validate_args=(
+  --recipe recipe.yaml
+  --phase conformance
+  --namespace gpu-operator
+  --kubeconfig="${HOME}/.kube/config"
+  --require-gpu
+  --image=ko.local:smoke-test
+  --timeout=10m
+  --toleration '*'
+  --output=validation-result.yaml
+  --evidence-dir=conformance-evidence
+)
+
+AICR_VALIDATOR_IMAGE_REGISTRY=ko.local ./aicr validate "${validate_args[@]}"
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index 27d9d27cc..08152c25e 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -38,6 +38,8 @@ jobs:
       should-run: ${{ steps.filter.outputs.matched }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
       - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d  # v4.0.1
         id: filter
         with:
@@ -215,12 +217,7 @@ jobs:
       - name: Run chainsaw health checks
         # The H100 stack can make namespace cleanup API calls slow under load.
         # Keep cleanup enabled, but allow more than the default 30s deadline.
-        run: |
-          chainsaw test \
-            --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --config tests/chainsaw/chainsaw-config.yaml \
-            --cleanup-timeout 120s \
-            --delete-timeout 120s
+        run: bash .github/scripts/gpu-chainsaw-health.sh tests/chainsaw/ai-conformance/kind-inference-dynamo
 
       # --- CNCF AI Conformance validation ---
       # Runs after the stack health checks so gateway and metrics validators
@@ -245,19 +242,7 @@ jobs:
 
       - name: Validate CNCF AI Conformance
         id: validate-conformance
-        run: |
-          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
-          ./aicr validate \
-            --recipe recipe.yaml \
-            --phase conformance \
-            --namespace gpu-operator \
-            --kubeconfig="${HOME}/.kube/config" \
-            --require-gpu \
-            --image=ko.local:smoke-test \
-            --timeout=10m \
-            --toleration '*' \
-            --output=validation-result.yaml \
-            --evidence-dir=conformance-evidence
+        run: bash .github/scripts/gpu-validate-conformance.sh
 
       # Dynamo smoke is intentionally disabled for now. The vLLM runtime image
       # adds significant latency and flakiness in Kind CI, and training has no
@@ -282,105 +267,9 @@ jobs:
         if: failure()
         timeout-minutes: 5
         shell: bash
-        run: |
-          set -o pipefail
-          kubectl_kind() {
-            timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
-          }
-          print_workload_images() {
-            local ns="$1"
-            kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \
-              | jq -r '
-                .items[] |
-                [
-                  .kind,
-                  .metadata.namespace + "/" + .metadata.name,
-                  (([.spec.template.spec.containers[]?.image] +
-                    [.spec.template.spec.initContainers[]?.image]) | unique | join(","))
-                ] | @tsv
-              ' || true
-          }
-
-          echo "=== Workload image inventory ==="
-          for NS in cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \
-            nvidia-network-operator kai-scheduler dynamo-system kgateway-system; do
-            echo "--- ${NS} ---"
-            print_workload_images "${NS}"
-          done
-          echo "=== ClusterPolicy status ==="
-          kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true
-          echo "=== GPU Operator pods ==="
-          kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true
-          echo "=== Non-running pods (all namespaces) ==="
-          kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-          echo "=== Recent events (gpu-operator) ==="
-          kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-          echo "=== Dynamo pods ==="
-          kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true
-          echo "=== Dynamo operator logs ==="
-          kubectl_kind -n dynamo-system \
-            logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
-          echo "=== Recent events (dynamo-system) ==="
-          kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-          echo "=== KAI scheduler pods ==="
-          kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true
-          echo "=== KAI admission deployment ==="
-          kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true
-          echo "=== KAI admission deployment describe ==="
-          kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true
-          echo "=== KAI admission pod describe ==="
-          kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \
-            | grep '^pod/admission-' \
-            | while read -r pod; do
-                kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true
-              done || true
-          echo "=== KAI admission logs ==="
-          kubectl_kind -n kai-scheduler \
-            logs deployment/admission --all-containers --tail=200 2>/dev/null || true
-          echo "=== KAI scheduler logs ==="
-          kubectl_kind -n kai-scheduler \
-            logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
-          echo "=== KAI scheduler queues ==="
-          kubectl_kind get queues -A 2>/dev/null || true
-          echo "=== KAI scheduler podgroups ==="
-          kubectl_kind get podgroups -A 2>/dev/null || true
-          echo "=== Recent events (kai-scheduler) ==="
-          kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
-          echo "=== Custom metrics API ==="
-          for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
-            echo "--- ${METRIC} ---"
-            for NS in gpu-operator dynamo-system; do
-              kubectl_kind get --raw \
-                "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true
-            done
-          done
-          echo "=== Grafana deployment ==="
-          kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true
-          echo "=== Grafana pods ==="
-          kubectl_kind -n monitoring get pods \
-            -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true
-          echo "=== Grafana deployment describe ==="
-          kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true
-          echo "=== Grafana pod describe ==="
-          kubectl_kind -n monitoring describe pods \
-            -l app.kubernetes.io/name=grafana 2>/dev/null || true
-          echo "=== prometheus-adapter pods ==="
-          kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
-          echo "=== kgateway pods ==="
-          kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true
-          echo "=== GatewayClass status ==="
-          kubectl_kind get gatewayclass -o yaml 2>/dev/null || true
-          echo "=== Gateway status ==="
-          kubectl_kind get gateways -A -o yaml 2>/dev/null || true
-          echo "=== DCGM Exporter pods ==="
-          kubectl_kind -n gpu-operator \
-            get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
-          echo "=== Monitoring pods ==="
-          kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true
-          echo "=== DRA ResourceSlices ==="
-          kubectl_kind get resourceslices -o wide 2>/dev/null || true
-          echo "=== Node status ==="
-          kubectl_kind get nodes -o wide 2>/dev/null || true
+        env:
+          GPU_TEST_DIAGNOSTIC_MODE: inference
+        run: bash .github/scripts/gpu-debug-diagnostics.sh
 
       - name: GPU Test Cleanup
         if: always()
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index 972e38f73..f5ecbaf7f 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -38,6 +38,8 @@ jobs:
       should-run: ${{ steps.filter.outputs.matched }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
       - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d  # v4.0.1
         id: filter
         with:
@@ -211,12 +213,7 @@ jobs:
       - name: Run chainsaw health checks
         # The H100 stack can make namespace cleanup API calls slow under load.
         # Keep cleanup enabled, but allow more than the default 30s deadline.
-        run: |
-          chainsaw test \
-            --test-dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
-            --config tests/chainsaw/chainsaw-config.yaml \
-            --cleanup-timeout 120s \
-            --delete-timeout 120s
+        run: bash .github/scripts/gpu-chainsaw-health.sh tests/chainsaw/ai-conformance/kind-training-kubeflow
 
       # --- CNCF AI Conformance validation ---
       # Runs last to ensure the DCGM → Prometheus → adapter pipeline
@@ -241,19 +238,7 @@ jobs:
 
       - name: Validate CNCF AI Conformance
         id: validate-conformance
-        run: |
-          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
-          ./aicr validate \
-            --recipe recipe.yaml \
-            --phase conformance \
-            --namespace gpu-operator \
-            --kubeconfig="${HOME}/.kube/config" \
-            --require-gpu \
-            --image=ko.local:smoke-test \
-            --timeout=10m \
-            --toleration '*' \
-            --output=validation-result.yaml \
-            --evidence-dir=conformance-evidence
+        run: bash .github/scripts/gpu-validate-conformance.sh
 
       # --- Validation artifacts ---
 
@@ -274,81 +259,9 @@ jobs:
         if: failure()
         timeout-minutes: 5
         shell: bash
-        run: |
-          set -o pipefail
-          kubectl_kind() {
-            timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
-          }
-          print_workload_images() {
-            local ns="$1"
-            kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \
-              | jq -r '
-                .items[] |
-                [
-                  .kind,
-                  .metadata.namespace + "/" + .metadata.name,
-                  (([.spec.template.spec.containers[]?.image] +
-                    [.spec.template.spec.initContainers[]?.image]) | unique | join(","))
-                ] | @tsv
-              ' || true
-          }
-
-          echo "=== Workload image inventory ==="
-          for NS in cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \
-            nvidia-network-operator kai-scheduler kubeflow; do
-            echo "--- ${NS} ---"
-            print_workload_images "${NS}"
-          done
-          echo "=== Grafana deployment ==="
-          kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true
-          echo "=== Grafana pods ==="
-          kubectl_kind -n monitoring get pods \
-            -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true
-          echo "=== Grafana deployment describe ==="
-          kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true
-          echo "=== Grafana pod describe ==="
-          kubectl_kind -n monitoring describe pods \
-            -l app.kubernetes.io/name=grafana 2>/dev/null || true
-          echo "=== KAI scheduler pods ==="
-          kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true
-          echo "=== KAI admission deployment ==="
-          kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true
-          echo "=== KAI admission deployment describe ==="
-          kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true
-          echo "=== KAI admission pod describe ==="
-          kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \
-            | grep '^pod/admission-' \
-            | while read -r pod; do
-                kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true
-              done || true
-          echo "=== KAI admission logs ==="
-          kubectl_kind -n kai-scheduler \
-            logs deployment/admission --all-containers --tail=200 2>/dev/null || true
-          echo "=== KAI scheduler logs ==="
-          kubectl_kind -n kai-scheduler \
-            logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
-          echo "=== KAI scheduler queues ==="
-          kubectl_kind get queues -A 2>/dev/null || true
-          echo "=== KAI scheduler podgroups ==="
-          kubectl_kind get podgroups -A 2>/dev/null || true
-          echo "=== Recent events (kai-scheduler) ==="
-          kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
-          echo "=== Kubeflow Trainer deployment ==="
-          kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
-          echo "=== Kubeflow pods ==="
-          kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true
-          echo "=== Kubeflow validating webhooks ==="
-          kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
-          echo "=== Kubeflow Trainer CRD ==="
-          kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
-          echo "=== Non-running pods (all namespaces) ==="
-          kubectl_kind get pods -A \
-            --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-          echo "=== GPU Operator pods ==="
-          kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true
-          echo "=== Node resources ==="
-          kubectl_kind describe nodes 2>/dev/null | \
-            grep -A 20 "Allocated resources" || true
+        env:
+          GPU_TEST_DIAGNOSTIC_MODE: training
+        run: bash .github/scripts/gpu-debug-diagnostics.sh
 
       - name: GPU Test Cleanup
         if: always()
diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml
index 805548afc..0cbb2da5c 100644
--- a/.github/workflows/gpu-smoke-test.yaml
+++ b/.github/workflows/gpu-smoke-test.yaml
@@ -38,6 +38,8 @@ jobs:
       should-run: ${{ steps.filter.outputs.matched }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
       - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d  # v4.0.1
         id: filter
         with:
@@ -106,28 +108,7 @@ jobs:
           method: helm
 
       - name: Run nvidia-smi in a pod
-        run: |
-          cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f -
-          apiVersion: v1
-          kind: Pod
-          metadata:
-            name: gpu-smoke-test
-          spec:
-            restartPolicy: Never
-            containers:
-            - name: nvidia-smi
-              image: ubuntu:22.04
-              command: ["nvidia-smi"]
-              resources:
-                limits:
-                  nvidia.com/gpu: 1
-          EOF
-
-          echo "Waiting for gpu-smoke-test pod to complete..."
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \
-            --for=condition=Ready --timeout=120s || true
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \
-            --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s
+        run: bash .github/scripts/gpu-smoke-run-nvidia-smi.sh
 
       - name: Show nvidia-smi output
         run: kubectl --context="kind-${KIND_CLUSTER_NAME}" logs gpu-smoke-test
@@ -143,17 +124,9 @@ jobs:
 
       - name: Debug diagnostics
         if: failure()
-        run: |
-          echo "=== ClusterPolicy status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true
-          echo "=== GPU Operator pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
-          echo "=== Non-running pods (all namespaces) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-          echo "=== Recent events (gpu-operator) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-          echo "=== Node status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true
+        env:
+          GPU_TEST_DIAGNOSTIC_MODE: smoke
+        run: bash .github/scripts/gpu-debug-diagnostics.sh
 
       - name: GPU Test Cleanup
         if: always()
diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md
index ba62aef1a..02b104a8c 100644
--- a/docs/user/cli-reference.md
+++ b/docs/user/cli-reference.md
@@ -1322,8 +1322,8 @@ After `helm install`, the same manifests are re-applied as post-install to ensur
 
 Components that use operator patterns with custom resources that reconcile asynchronously use component-specific install behavior:
 
-- `kai-scheduler` installs without `--wait` to avoid Helm timing out on CR readiness. It uses a 30 minute per-attempt timeout and caps the retry budget at 1 retry so hook diagnostics surface quickly on cold runners.
-- `dynamo-platform` uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20 minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts.
+- `kai-scheduler` installs without `--wait` to avoid Helm timing out on CR readiness. It uses a 30-minute per-attempt timeout and caps the retry budget at 1 retry so hook diagnostics surface quickly on cold runners.
+- `dynamo-platform` uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20-minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts.
 - `kube-prometheus-stack` keeps the default timeout and retry budget because Grafana can hit its Deployment progress deadline before a longer Helm timeout would help, especially on slower H100 CI runners under image-pull and control-plane load.
 
 ##### DRA kubelet plugin registration
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index ff41c88e4..4028a8246 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -575,12 +575,16 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 			wantRetryCap:        `if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]`,
 			wantApplyArgs:       `COMPONENT_HELM_APPLY_ARGS=(--server-side=false)`,
 			wantSnippets: []string{
+				`helm_supports_server_side_flag`,
+				`--request-timeout="${KUBECTL_REQUEST_TIMEOUT}"`,
+				`dynamo-platform conflict mitigation requires a Helm client that supports --server-side=false`,
 				`dump_dynamo_platform_helm_diagnostics "${namespace}"`,
 				`deployment/dynamo-platform-dynamo-operator-controller-manager`,
 				`--previous --tail=200`,
 			},
 			wantReadmeSnippets: []string{
 				`--server-side=false`,
+				`Helm client that supports the flag`,
 				`--wait --timeout 20m`,
 			},
 		},
diff --git a/pkg/bundler/deployer/helm/templates/README.md.tmpl b/pkg/bundler/deployer/helm/templates/README.md.tmpl
index ba3cef380..3a1368470 100644
--- a/pkg/bundler/deployer/helm/templates/README.md.tmpl
+++ b/pkg/bundler/deployer/helm/templates/README.md.tmpl
@@ -104,6 +104,9 @@ helm upgrade --install {{ .Name }} {{ .ChartName }} \
   {{ end -}}
 {{ end -}}
 ```
+{{ if eq .Name "dynamo-platform" }}
+`--server-side=false` requires a Helm client that supports the flag (Helm v4 or compatible). `deploy.sh` checks support before adding the flag and warns before falling back when unavailable.
+{{ end -}}
 {{ end -}}
 {{ if .HasManifests }}
 ```bash
diff --git a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
index 7797779a0..84517401d 100644
--- a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
+++ b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
@@ -70,6 +70,9 @@ helm upgrade --install {{ .Name }} {{ .ChartName }} \
   {{ end -}}
 {{ end -}}
 ```
+{{ if eq .Name "dynamo-platform" }}
+`--server-side=false` requires a Helm client that supports the flag (Helm v4 or compatible). `deploy.sh` checks support before adding the flag and warns before falling back when unavailable.
+{{ end -}}
 {{ if .HasManifests }}
 After the chart is installed, apply additional manifests:
 
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index 826ab08ed..c8f7ceb68 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -24,6 +24,7 @@ trap 'rm -rf "${HELM_WORKDIR}"' EXIT
 cd "${HELM_WORKDIR}"
 
 HELM_TIMEOUT="10m"
+KUBECTL_REQUEST_TIMEOUT="10s"
 NO_WAIT=false
 BEST_EFFORT=false
 FAILED_COMPONENTS=""
@@ -59,6 +60,10 @@ function backoff_seconds() {
   echo "${seconds}"
 }
 
+function helm_supports_server_side_flag() {
+  helm help upgrade 2>/dev/null | grep -q -- '--server-side'
+}
+
 function retry() {
   local desc="$1"; shift
   local attempt=0
@@ -152,23 +157,23 @@ function dump_dynamo_platform_helm_diagnostics() {
 
   echo "  --- ${namespace} diagnostics ---"
   echo "  Deployments:"
-  kubectl get deployments -n "${namespace}" -o wide 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get deployments -n "${namespace}" -o wide 2>/dev/null || true
   echo "  Jobs:"
-  kubectl get jobs -n "${namespace}" 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" 2>/dev/null || true
   echo "  Pods:"
-  kubectl get pods -n "${namespace}" -o wide 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -o wide 2>/dev/null || true
   echo "  Pod descriptions:"
-  kubectl describe pods -n "${namespace}" 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}" 2>/dev/null || true
   echo "  Dynamo operator manager logs:"
-  kubectl logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --tail=200 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --tail=200 2>/dev/null || true
   echo "  Dynamo operator manager previous logs:"
-  kubectl logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --previous --tail=200 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --previous --tail=200 2>/dev/null || true
   echo "  Grove operator logs:"
-  kubectl logs deployment/grove-operator -n "${namespace}" --all-containers --tail=100 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/grove-operator -n "${namespace}" --all-containers --tail=100 2>/dev/null || true
   echo "  Grove operator previous logs:"
-  kubectl logs deployment/grove-operator -n "${namespace}" --all-containers --previous --tail=100 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/grove-operator -n "${namespace}" --all-containers --previous --tail=100 2>/dev/null || true
   echo "  Recent events:"
-  kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
   echo "  --- End ${namespace} diagnostics ---"
 }
 
@@ -414,8 +419,12 @@ fi
 COMPONENT_HELM_TIMEOUT="20m"
 # Grove owns the generated webhook certificate Secret data after install.
 # Client-side apply avoids server-side field ownership conflicts during retries.
-# Requires Helm v4+ for --server-side=false; AICR bundles pin Helm v4 in .settings.yaml.
-COMPONENT_HELM_APPLY_ARGS=(--server-side=false)
+# This flag requires a Helm client that supports --server-side=false.
+if helm_supports_server_side_flag; then
+  COMPONENT_HELM_APPLY_ARGS=(--server-side=false)
+else
+  echo "::warning::dynamo-platform conflict mitigation requires a Helm client that supports --server-side=false; proceeding without this flag"
+fi
 if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then
   COMPONENT_MAX_RETRIES="3"
 fi

From 1d553296663225f40a2edb21bd228eaadeeab427 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sun, 26 Apr 2026 15:39:11 -0700
Subject: [PATCH 19/21] ci: address GPU workflow review hardening

---
 .github/actions/README.md                     |  7 +++
 .github/actions/aicr-build/build-cli.sh       | 14 +++++
 .../aicr-build/build-snapshot-agent.sh        | 16 +++++-
 .../aicr-build/build-validator-images.sh      | 23 +++++++-
 .github/actions/aicr-build/stage-cli.sh       | 14 +++++
 .../check-runner-capacity.sh                  | 10 ++--
 .../configure-nvidia-container-toolkit.sh     | 15 ++++-
 .../create-gpu-kind-cluster.sh                | 56 ++++++++++++++++++-
 .../delete-stale-kind-cluster.sh              | 18 +++---
 .../gpu-cluster-setup/install-nvkind.sh       |  8 ++-
 .../gpu-cluster-setup/warm-kind-node-image.sh | 10 ++--
 .../gpu-operator-install/generate-bundle.sh   |  1 +
 .../install-gpu-operator-helm.sh              |  5 +-
 .../wait-gpu-operands-bundle.sh               |  6 +-
 .../wait-gpu-operands-helm.sh                 | 19 ++++++-
 .../validate-snapshot-gpu.sh                  |  4 ++
 .../collect-debug-artifacts.sh                |  4 +-
 .../install-karpenter-kwok.sh                 |  8 ++-
 .github/scripts/gpu-chainsaw-health.sh        | 32 ++++++++++-
 .github/scripts/gpu-debug-diagnostics.sh      | 24 ++++++--
 .github/scripts/gpu-smoke-run-nvidia-smi.sh   | 29 ++++++++--
 .../scripts/gpu-smoke-show-nvidia-smi-log.sh  | 35 ++++++++++++
 .github/scripts/gpu-validate-conformance.sh   | 14 +++++
 .github/workflows/gpu-smoke-test.yaml         |  2 +-
 docs/user/cli-reference.md                    |  2 +-
 pkg/bundler/deployer/helm/helm_test.go        | 11 ++--
 .../deployer/helm/templates/README.md.tmpl    |  2 +-
 .../helm/templates/component-README.md.tmpl   |  2 +-
 .../deployer/helm/templates/deploy.sh.tmpl    | 56 ++++++++++++++-----
 29 files changed, 381 insertions(+), 66 deletions(-)
 create mode 100644 .github/scripts/gpu-smoke-show-nvidia-smi-log.sh

diff --git a/.github/actions/README.md b/.github/actions/README.md
index 3b1ee648d..15710df7d 100644
--- a/.github/actions/README.md
+++ b/.github/actions/README.md
@@ -4,6 +4,13 @@ This directory contains a modular, reusable GitHub Actions architecture optimize
 
 ## Composite Actions
 
+### Script Conventions
+
+Composite action helper scripts in this directory are intentionally portable
+across checkout modes: keep them mode `0644` and invoke them as
+`bash path/to/script.sh` from workflows or `action.yml` files. Do not rely on
+executable bits or `./script.sh` invocation.
+
 ### Core CI/CD Actions
 
 #### `security-scan/`
diff --git a/.github/actions/aicr-build/build-cli.sh b/.github/actions/aicr-build/build-cli.sh
index 81b657f58..83c834aad 100644
--- a/.github/actions/aicr-build/build-cli.sh
+++ b/.github/actions/aicr-build/build-cli.sh
@@ -1,4 +1,18 @@
 #!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set -euo pipefail
 
 mkdir -p dist
diff --git a/.github/actions/aicr-build/build-snapshot-agent.sh b/.github/actions/aicr-build/build-snapshot-agent.sh
index 9650fbd0d..512aad2f0 100644
--- a/.github/actions/aicr-build/build-snapshot-agent.sh
+++ b/.github/actions/aicr-build/build-snapshot-agent.sh
@@ -1,9 +1,23 @@
 #!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set -euo pipefail
 
 # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
 # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) because only nvidia-smi is needed.
-docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
+timeout 900s docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
 FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04
 COPY dist/aicr /usr/local/bin/aicr
 ENTRYPOINT ["/usr/local/bin/aicr"]
diff --git a/.github/actions/aicr-build/build-validator-images.sh b/.github/actions/aicr-build/build-validator-images.sh
index 4389b15b4..76682af90 100644
--- a/.github/actions/aicr-build/build-validator-images.sh
+++ b/.github/actions/aicr-build/build-validator-images.sh
@@ -1,4 +1,18 @@
 #!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set -euo pipefail
 
 if [[ -n "${VALIDATOR_PHASES}" ]]; then
@@ -14,12 +28,21 @@ fi
 
 mkdir -p dist/validator
 for phase in ${PHASES//,/ }; do
+  if ! [[ "${phase}" =~ ^[a-z][a-z0-9_-]*$ ]]; then
+    echo "::error::invalid validator phase '${phase}'; expected ^[a-z][a-z0-9_-]*$"
+    exit 1
+  fi
   echo "Building validator binary: ${phase}"
   CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}"
 done
 
 for phase in ${PHASES//,/ }; do
-  mkdir -p "validators/${phase}/testdata"
+  if [[ ! -d "validators/${phase}/testdata" ]]; then
+    echo "::warning::validators/${phase}/testdata is missing; creating empty testdata directory"
+    mkdir -p "validators/${phase}/testdata"
+  fi
-  docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <<DOCKERFILE
+  # Bound each image build like the snapshot-agent build so a stalled Docker
+  # daemon or base-image pull fails the step instead of hanging the runner.
+  timeout 900s docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <<DOCKERFILE
 FROM gcr.io/distroless/static-debian12:nonroot
 COPY dist/validator/${phase} /${phase}
diff --git a/.github/actions/aicr-build/stage-cli.sh b/.github/actions/aicr-build/stage-cli.sh
index 929aed9e8..c5b737a4d 100644
--- a/.github/actions/aicr-build/stage-cli.sh
+++ b/.github/actions/aicr-build/stage-cli.sh
@@ -1,4 +1,18 @@
 #!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set -euo pipefail
 
 cp dist/aicr ./aicr
diff --git a/.github/actions/gpu-cluster-setup/check-runner-capacity.sh b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
index 2e83beeb9..ff6c3168e 100644
--- a/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
+++ b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
@@ -14,9 +14,11 @@
 # limitations under the License.
 
 set -euo pipefail
-free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
-if (( free_disk_gb < MIN_FREE_DISK_GB )); then
-  echo "::error::free disk on / is ${free_disk_gb}GiB, need at least ${MIN_FREE_DISK_GB}GiB"
+free_disk_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9')
+min_free_disk_bytes=$((MIN_FREE_DISK_GB * 1024 * 1024 * 1024))
+free_disk_gib=$((free_disk_bytes / 1024 / 1024 / 1024))
+if (( free_disk_bytes < min_free_disk_bytes )); then
+  echo "::error::free disk on / is ${free_disk_bytes} bytes (${free_disk_gib}GiB), need at least ${min_free_disk_bytes} bytes (${MIN_FREE_DISK_GB}GiB)"
   exit 1
 fi
 
@@ -26,4 +28,4 @@ if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then
   exit 1
 fi
 
-echo "Runner capacity is sufficient: disk=${free_disk_gb}GiB memory=${available_memory_gb}GiB"
+echo "Runner capacity is sufficient: disk=${free_disk_gib}GiB (${free_disk_bytes} bytes) memory=${available_memory_gb}GiB"
diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
index 0a2fcd814..50d8c10fa 100644
--- a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
+++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
@@ -18,4 +18,17 @@ set -euo pipefail
 sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
 sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
 sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place
-sudo systemctl restart docker
+timeout 120s sudo systemctl restart docker
+
+for attempt in $(seq 1 30); do
+  if systemctl is-active --quiet docker && docker info >/dev/null 2>&1; then
+    echo "Docker is healthy after NVIDIA runtime configuration."
+    exit 0
+  fi
+  echo "Waiting for Docker to become healthy... (${attempt}/30)"
+  sleep 2
+done
+
+echo "::error::Docker did not become healthy after NVIDIA runtime configuration"
+sudo systemctl status docker --no-pager || true
+exit 1
diff --git a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
index 42a282e17..2f9ab4817 100644
--- a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
+++ b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
@@ -96,6 +96,11 @@ esac
 if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
   patch_dir="$(mktemp -d)"
   config_template="$(mktemp)"
+  cleanup_generated_config() {
+    [[ -n "${patch_dir:-}" ]] && rm -rf "${patch_dir}"
+    [[ -n "${config_template:-}" ]] && rm -f "${config_template}"
+  }
+  trap cleanup_generated_config EXIT
 
   # Keep YAML heredocs at column 0; indentation is literal content.
   if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
@@ -361,6 +366,48 @@ static_pod_manifest_contains_arg() {
   docker exec "${node}" grep -Fq -- "- ${expected}" "/etc/kubernetes/manifests/${component}.yaml"
 }
 
+running_static_pod_container_contains_arg() {
+  local component="$1"
+  local expected="$2"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local container_ids
+  local container_id
+
+  if ! container_ids="$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null)"; then
+    return 1
+  fi
+  [[ -z "${container_ids}" ]] && return 1
+
+  for container_id in ${container_ids}; do
+    if docker exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -e --arg expected "${expected}" '
+      ([.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?] | index($expected)) != null
+    ' >/dev/null; then
+      return 0
+    fi
+  done
+  return 1
+}
+
+dump_running_static_pod_container_args() {
+  local component="$1"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local container_ids
+  local container_id
+
+  echo "Running ${component} CRI container args:"
+  container_ids="$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)"
+  if [[ -z "${container_ids}" ]]; then
+    echo "(no running ${component} CRI containers found)"
+    return
+  fi
+  for container_id in ${container_ids}; do
+    echo "--- ${container_id} ---"
+    docker exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -r '
+      [.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?][]?
+    ' || true
+  done
+}
+
 dump_static_pod_manifest() {
   local component="$1"
   local node="${KIND_CLUSTER_NAME}-control-plane"
@@ -375,14 +422,19 @@ assert_control_plane_arg() {
   local command_args
 
   command_args="$(control_plane_command_args "${component}")"
-  if ! grep -Fxq "${expected}" <<< "${command_args}"; then
+  if ! grep -Fxq -- "${expected}" <<< "${command_args}"; then
+    if running_static_pod_container_contains_arg "${component}" "${expected}"; then
+      echo "${component} running CRI container args verified: ${expected} (live mirror pod omitted it)"
+      return
+    fi
     if static_pod_manifest_contains_arg "${component}" "${expected}"; then
-      echo "::warning::${component} live mirror pod command/args did not show ${expected}; static pod manifest is patched"
+      echo "::warning::${component} live mirror pod and running CRI container args did not show ${expected}; static pod manifest is patched"
       return
     fi
     echo "::error::${component} live pod command/args does not contain ${expected}"
     echo "Observed live command/args:"
     echo "${command_args}"
+    dump_running_static_pod_container_args "${component}"
     dump_static_pod_manifest "${component}"
     exit 1
   fi
diff --git a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
index 8e85ffcb9..0f29c469b 100644
--- a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
+++ b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
@@ -15,6 +15,10 @@
 
 set -euo pipefail
 kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+docker_timeout() {
+  timeout 30s docker "$@"
+}
+
 if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then
   echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
   if ! timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then
@@ -24,16 +28,21 @@ else
   echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
 fi
 
-remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")
-if [[ -n "${remaining_containers}" ]]; then
+# Capture IDs with command substitution so a failed or timed-out `docker ps`
+# aborts under `set -e`; a process substitution would hide that failure and
+# leave the array empty, which reads as "no stale containers".
+remaining_container_ids="$(docker_timeout ps -aq --filter "label=${kind_cluster_label}")"
+mapfile -t remaining_containers < <(printf '%s' "${remaining_container_ids}")
+if (( ${#remaining_containers[@]} > 0 )); then
   echo "Removing stale containers for ${KIND_CLUSTER_NAME}:"
-  docker ps -a --filter "label=${kind_cluster_label}"
-  docker rm -f ${remaining_containers}
+  docker_timeout ps -a --filter "label=${kind_cluster_label}"
+  docker_timeout rm -f "${remaining_containers[@]}"
 fi
 
-remaining_containers=$(docker ps -aq --filter "label=${kind_cluster_label}")
-if [[ -n "${remaining_containers}" ]]; then
+remaining_container_ids="$(docker_timeout ps -aq --filter "label=${kind_cluster_label}")"
+mapfile -t remaining_containers < <(printf '%s' "${remaining_container_ids}")
+if (( ${#remaining_containers[@]} > 0 )); then
   echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:"
-  docker ps -a --filter "label=${kind_cluster_label}"
+  docker_timeout ps -a --filter "label=${kind_cluster_label}"
   exit 1
 fi
diff --git a/.github/actions/gpu-cluster-setup/install-nvkind.sh b/.github/actions/gpu-cluster-setup/install-nvkind.sh
index 38f1ce0ae..c2200e078 100644
--- a/.github/actions/gpu-cluster-setup/install-nvkind.sh
+++ b/.github/actions/gpu-cluster-setup/install-nvkind.sh
@@ -15,5 +15,18 @@
 
 set -euo pipefail
 
+if [[ -z "${NVKIND_VERSION:-}" ]]; then
+  echo "::error::NVKIND_VERSION must be set"
+  exit 1
+fi
+
 go install "github.com/NVIDIA/nvkind/cmd/nvkind@${NVKIND_VERSION}"
-nvkind --help
+# Locate the binary where `go install` put it: GOBIN as reported by `go env`
+# (covers the environment and `go env -w`), otherwise bin/ under the first
+# GOPATH entry, since GOPATH may be a colon-separated list.
+nvkind_dir="$(go env GOBIN)"
+if [[ -z "${nvkind_dir}" ]]; then
+  gopath="$(go env GOPATH)"
+  nvkind_dir="${gopath%%:*}/bin"
+fi
+"${nvkind_dir}/nvkind" --help
diff --git a/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
index 3d58a4887..b0567fa7c 100644
--- a/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
+++ b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
@@ -21,9 +21,11 @@ else
   echo "Pulling kind node image: ${KIND_NODE_IMAGE}"
   timeout 600s docker pull "${KIND_NODE_IMAGE}"
 fi
-free_disk_gb=$(df -BG --output=avail / | tail -1 | tr -dc '0-9')
-if (( free_disk_gb < MIN_FREE_DISK_GB )); then
-  echo "::error::free disk on / is ${free_disk_gb}GiB after warming ${KIND_NODE_IMAGE}, need at least ${MIN_FREE_DISK_GB}GiB"
+free_disk_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9')
+min_free_disk_bytes=$((MIN_FREE_DISK_GB * 1024 * 1024 * 1024))
+free_disk_gib=$((free_disk_bytes / 1024 / 1024 / 1024))
+if (( free_disk_bytes < min_free_disk_bytes )); then
+  echo "::error::free disk on / is ${free_disk_bytes} bytes (${free_disk_gib}GiB) after warming ${KIND_NODE_IMAGE}, need at least ${min_free_disk_bytes} bytes (${MIN_FREE_DISK_GB}GiB)"
   exit 1
 fi
-echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gb}GiB"
+echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gib}GiB (${free_disk_bytes} bytes)"
diff --git a/.github/actions/gpu-operator-install/generate-bundle.sh b/.github/actions/gpu-operator-install/generate-bundle.sh
index c7dd3f413..095b68415 100644
--- a/.github/actions/gpu-operator-install/generate-bundle.sh
+++ b/.github/actions/gpu-operator-install/generate-bundle.sh
@@ -15,6 +15,7 @@
 
 set -euo pipefail
 
+rm -rf bundle
 ./aicr bundle \
   --recipe recipe.yaml \
   --accelerated-node-toleration nvidia.com/gpu:NoSchedule \
diff --git a/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh
index 0aea450eb..6079cad83 100644
--- a/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh
+++ b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh
@@ -15,7 +15,11 @@
 
 set -euo pipefail
 
-helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+# Pinned chart version keeps CI reproducible; export GPU_OPERATOR_CHART_VERSION
+# to qualify a different release without editing this script.
+GPU_OPERATOR_CHART_VERSION="${GPU_OPERATOR_CHART_VERSION:-v25.10.1}"
+
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update
 helm repo update
 helm upgrade -i \
   --kube-context="kind-${KIND_CLUSTER_NAME}" \
@@ -25,5 +29,6 @@ helm upgrade -i \
   --set toolkit.enabled=false \
   --set dcgmExporter.enabled=false \
   --set nfd.enabled=true \
+  --version="${GPU_OPERATOR_CHART_VERSION}" \
   --wait --timeout=600s \
   gpu-operator nvidia/gpu-operator
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
index 496eb372e..2fee8a2c0 100644
--- a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
+++ b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
@@ -20,7 +20,7 @@ echo "Waiting for GPU operator controller to deploy operands..."
 # DaemonSets for device-plugin, NFD, GFD, etc. This happens
 # asynchronously after the helm install completes.
 for i in $(seq 1 30); do
-  count=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+  count=$(kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
     get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l)
   if [[ "$count" -gt 0 ]]; then
     echo "Device plugin DaemonSet found."
@@ -32,7 +32,7 @@ done
 echo "Waiting for device plugin rollout..."
 # Operands are excluded from control-plane nodes via nodeAffinity in
 # the kind overlay, so all scheduled pods should become ready.
-kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
   rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
 echo "GPU Operator pods:"
-kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
+kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
index 2ad7e801d..2f0bbe159 100644
--- a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
+++ b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
@@ -16,7 +16,22 @@
 set -euo pipefail
 
 echo "Waiting for device plugin to be ready..."
-kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+for i in $(seq 1 30); do
+  if kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+    get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | grep -q .; then
+    echo "Device plugin DaemonSet found."
+    break
+  fi
+  if (( i == 30 )); then
+    echo "::error::device plugin DaemonSet was not created within 300s"
+    kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods || true
+    exit 1
+  fi
+  echo "Waiting for device plugin DaemonSet to be created... (${i}/30)"
+  sleep 10
+done
+
+kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
   rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true
 echo "GPU Operator pods:"
-kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
+kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
diff --git a/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh
index 57a622a2d..5a27e6093 100644
--- a/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh
+++ b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh
@@ -20,6 +20,10 @@ GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | se
 GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml)
 echo "GPU model: ${GPU_MODEL}"
 echo "GPU count: ${GPU_COUNT}"
+if ! [[ "${GPU_COUNT}" =~ ^[0-9]+$ ]]; then
+  echo "::error::Expected numeric gpu-count in snapshot, got: ${GPU_COUNT}"
+  exit 1
+fi
 if [[ "${GPU_MODEL}" != *"${EXPECTED_GPU_MODEL}"* ]]; then
   echo "::error::Expected ${EXPECTED_GPU_MODEL} GPU in snapshot, got: ${GPU_MODEL}"
   exit 1
diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
index 417606c7d..257695334 100644
--- a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
+++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -o pipefail
+# Diagnostic artifact collection intentionally omits -e so one broken cluster
+# call does not prevent later artifacts from being collected.
+set -uo pipefail
 mkdir -p /tmp/debug-artifacts
 CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd"
 kubectl_kind() {
diff --git a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
index 2fdb26312..472eb844b 100644
--- a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
+++ b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
@@ -32,10 +32,16 @@ validate_seconds_input() {
     echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'"
     exit 1
   fi
+  if (( input_value <= 0 )); then
+    echo "::error::${input_name} must be greater than 0 seconds, got '${input_value}'"
+    exit 1
+  fi
 }
 
 validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT}"
 validate_seconds_input ko_build_timeout "${KO_BUILD_TIMEOUT}"
 validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT}"
 bash kwok/scripts/install-karpenter-kwok.sh
-kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml
+timeout 30s kubectl --request-timeout=10s \
+  --context="kind-${KIND_CLUSTER_NAME}" \
+  apply -f kwok/manifests/karpenter/nodepool.yaml
diff --git a/.github/scripts/gpu-chainsaw-health.sh b/.github/scripts/gpu-chainsaw-health.sh
index 35450eb00..7e2c5ac96 100644
--- a/.github/scripts/gpu-chainsaw-health.sh
+++ b/.github/scripts/gpu-chainsaw-health.sh
@@ -1,10 +1,36 @@
 #!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set -euo pipefail
 
+if [[ $# -ne 1 ]]; then
+  echo "::error::Usage: $0 <test_dir>"
+  exit 2
+fi
 test_dir="$1"
+if [[ ! -d "${test_dir}" ]]; then
+  echo "::error::Test directory not found: ${test_dir}"
+  exit 1
+fi
+
+CHAINSAW_TEST_TIMEOUT="${CHAINSAW_TEST_TIMEOUT:-30m}"
+CHAINSAW_CLEANUP_TIMEOUT="${CHAINSAW_CLEANUP_TIMEOUT:-120s}"
+CHAINSAW_DELETE_TIMEOUT="${CHAINSAW_DELETE_TIMEOUT:-120s}"
 
-chainsaw test \
+timeout "${CHAINSAW_TEST_TIMEOUT}" chainsaw test \
   --test-dir "${test_dir}" \
   --config tests/chainsaw/chainsaw-config.yaml \
-  --cleanup-timeout 120s \
-  --delete-timeout 120s
+  --cleanup-timeout "${CHAINSAW_CLEANUP_TIMEOUT}" \
+  --delete-timeout "${CHAINSAW_DELETE_TIMEOUT}"
diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh
index 0d1cd76a2..3721691b3 100644
--- a/.github/scripts/gpu-debug-diagnostics.sh
+++ b/.github/scripts/gpu-debug-diagnostics.sh
@@ -1,5 +1,22 @@
 #!/usr/bin/env bash
-set -o pipefail
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Diagnostic script: intentionally omits -e so each mode can keep collecting
+# partial failure data. Keep -u and pipefail to catch script bugs and pipeline
+# failures while individual kubectl_kind calls tolerate cluster errors.
+set -uo pipefail
 
 mode="${GPU_TEST_DIAGNOSTIC_MODE:-smoke}"
 
@@ -86,6 +103,7 @@ case "${mode}" in
   training)
     print_workload_inventory cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \
       nvidia-network-operator kai-scheduler kubeflow
+    print_common_gpu_diagnostics
     print_grafana_diagnostics
     print_kai_diagnostics
     echo "=== Kubeflow Trainer deployment ==="
@@ -96,10 +114,6 @@ case "${mode}" in
     kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
     echo "=== Kubeflow Trainer CRD ==="
     kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
-    echo "=== Non-running pods (all namespaces) ==="
-    kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-    echo "=== GPU Operator pods ==="
-    kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true
     echo "=== Node resources ==="
     kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true
     ;;
diff --git a/.github/scripts/gpu-smoke-run-nvidia-smi.sh b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
index cefff49fa..1a2151253 100644
--- a/.github/scripts/gpu-smoke-run-nvidia-smi.sh
+++ b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
@@ -1,11 +1,27 @@
 #!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set -euo pipefail
 
-cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f -
+pod_name=$(cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" create -f - -o jsonpath='{.metadata.name}'
 apiVersion: v1
 kind: Pod
 metadata:
-  name: gpu-smoke-test
+  generateName: gpu-smoke-test-
+  labels:
+    app: gpu-smoke-test
 spec:
   restartPolicy: Never
   containers:
@@ -16,9 +32,12 @@ spec:
       limits:
         nvidia.com/gpu: 1
 EOF
+)
+
+echo "${pod_name}" > /tmp/aicr-gpu-smoke-pod-name
 
-echo "Waiting for gpu-smoke-test pod to complete..."
-kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \
+echo "Waiting for ${pod_name} pod to complete..."
+kubectl --context="kind-${KIND_CLUSTER_NAME}" wait "pod/${pod_name}" \
   --for=condition=Ready --timeout=120s || true
-kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \
+kubectl --context="kind-${KIND_CLUSTER_NAME}" wait "pod/${pod_name}" \
   --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s
diff --git a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
new file mode 100644
index 000000000..982648460
--- /dev/null
+++ b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Prefer the pod name that gpu-smoke-run-nvidia-smi.sh recorded under /tmp.
+pod_name=""
+if [[ -f /tmp/aicr-gpu-smoke-pod-name ]]; then
+  pod_name="$(cat /tmp/aicr-gpu-smoke-pod-name)"
+fi
+# Otherwise fall back to the newest pod carrying the app=gpu-smoke-test label.
+if [[ -z "${pod_name}" ]]; then
+  pod_name=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods \
+    -l app=gpu-smoke-test \
+    --sort-by=.metadata.creationTimestamp \
+    -o jsonpath='{.items[-1:].metadata.name}')
+fi
+# Neither lookup produced a name: emit an error annotation and fail the step.
+if [[ -z "${pod_name}" ]]; then
+  echo "::error::no gpu-smoke-test pod found"
+  exit 1
+fi
+
+kubectl --context="kind-${KIND_CLUSTER_NAME}" logs "${pod_name}"
diff --git a/.github/scripts/gpu-validate-conformance.sh b/.github/scripts/gpu-validate-conformance.sh
index 2fb3fa4a3..79550cb3a 100644
--- a/.github/scripts/gpu-validate-conformance.sh
+++ b/.github/scripts/gpu-validate-conformance.sh
@@ -1,4 +1,18 @@
 #!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set -euo pipefail
 
 AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml
index 0cbb2da5c..bdf607e07 100644
--- a/.github/workflows/gpu-smoke-test.yaml
+++ b/.github/workflows/gpu-smoke-test.yaml
@@ -111,7 +111,7 @@ jobs:
         run: bash .github/scripts/gpu-smoke-run-nvidia-smi.sh
 
       - name: Show nvidia-smi output
-        run: kubectl --context="kind-${KIND_CLUSTER_NAME}" logs gpu-smoke-test
+        run: bash .github/scripts/gpu-smoke-show-nvidia-smi-log.sh
 
       # --- Snapshot and validation ---
 
diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md
index 02b104a8c..c641eee07 100644
--- a/docs/user/cli-reference.md
+++ b/docs/user/cli-reference.md
@@ -1323,7 +1323,7 @@ After `helm install`, the same manifests are re-applied as post-install to ensur
 Components that use operator patterns with custom resources that reconcile asynchronously use component-specific install behavior:
 
 - `kai-scheduler` installs without `--wait` to avoid Helm timing out on CR readiness. It uses a 30-minute per-attempt timeout and caps the retry budget at 1 retry so hook diagnostics surface quickly on cold runners.
-- `dynamo-platform` uses client-side apply so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. Dynamo installs use a 20-minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts.
+- `dynamo-platform` is installed by `deploy.sh` with `--server-side=false` (client-side apply) when the Helm client supports it, so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. The script only adds `--server-side=false` when Helm v4.0.5 or later is detected; with older Helm clients it logs a warning and proceeds without that mitigation. Dynamo installs use a 20-minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts.
 - `kube-prometheus-stack` keeps the default timeout and retry budget because Grafana can hit its Deployment progress deadline before a longer Helm timeout would help, especially on slower H100 CI runners under image-pull and control-plane load.
 
 ##### DRA kubelet plugin registration
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index 4028a8246..da23545a0 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -512,10 +512,10 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) {
 	if !strings.Contains(script, `dump_kai_scheduler_helm_diagnostics "${namespace}"`) {
 		t.Error("deploy.sh missing kai-scheduler diagnostics hook")
 	}
-	if !strings.Contains(script, `kubectl get jobs -n "${namespace}"`) {
+	if !strings.Contains(script, `kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}"`) {
 		t.Error("deploy.sh missing job diagnostics")
 	}
-	if !strings.Contains(script, `kubectl describe pods -n "${namespace}"`) {
+	if !strings.Contains(script, `kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}"`) {
 		t.Error("deploy.sh missing pod diagnostics")
 	}
 
@@ -575,16 +575,17 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 			wantRetryCap:        `if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]`,
 			wantApplyArgs:       `COMPONENT_HELM_APPLY_ARGS=(--server-side=false)`,
 			wantSnippets: []string{
-				`helm_supports_server_side_flag`,
+				`helm_supports_server_side_false_install`,
+				`Require v4.0.5+ before relying on`,
 				`--request-timeout="${KUBECTL_REQUEST_TIMEOUT}"`,
-				`dynamo-platform conflict mitigation requires a Helm client that supports --server-side=false`,
+				`dynamo-platform conflict mitigation requires Helm v4.0.5+`,
 				`dump_dynamo_platform_helm_diagnostics "${namespace}"`,
 				`deployment/dynamo-platform-dynamo-operator-controller-manager`,
 				`--previous --tail=200`,
 			},
 			wantReadmeSnippets: []string{
 				`--server-side=false`,
-				`Helm client that supports the flag`,
+				`requires Helm v4.0.5 or later`,
 				`--wait --timeout 20m`,
 			},
 		},
diff --git a/pkg/bundler/deployer/helm/templates/README.md.tmpl b/pkg/bundler/deployer/helm/templates/README.md.tmpl
index 3a1368470..1ef2f252b 100644
--- a/pkg/bundler/deployer/helm/templates/README.md.tmpl
+++ b/pkg/bundler/deployer/helm/templates/README.md.tmpl
@@ -105,7 +105,7 @@ helm upgrade --install {{ .Name }} {{ .ChartName }} \
 {{ end -}}
 ```
 {{ if eq .Name "dynamo-platform" }}
-`--server-side=false` requires a Helm client that supports the flag (Helm v4 or compatible). `deploy.sh` checks support before adding the flag and warns before falling back when unavailable.
+`--server-side=false` requires Helm v4.0.5 or later. `deploy.sh` checks the Helm version before adding the flag and warns before falling back when unavailable.
 {{ end -}}
 {{ end -}}
 {{ if .HasManifests }}
diff --git a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
index 84517401d..66762ac7f 100644
--- a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
+++ b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
@@ -71,7 +71,7 @@ helm upgrade --install {{ .Name }} {{ .ChartName }} \
 {{ end -}}
 ```
 {{ if eq .Name "dynamo-platform" }}
-`--server-side=false` requires a Helm client that supports the flag (Helm v4 or compatible). `deploy.sh` checks support before adding the flag and warns before falling back when unavailable.
+`--server-side=false` requires Helm v4.0.5 or later. `deploy.sh` checks the Helm version before adding the flag and warns before falling back when unavailable.
 {{ end -}}
 {{ if .HasManifests }}
 After the chart is installed, apply additional manifests:
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index c8f7ceb68..4c1e26f28 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -60,7 +60,32 @@ function backoff_seconds() {
   echo "${seconds}"
 }
 
-function helm_supports_server_side_flag() {
+function helm_supports_server_side_false_install() {
+  local version
+  local major
+  local minor
+  local patch
+  # Returns 0 only when Helm is >= v4.0.5 and `helm help upgrade` lists the flag.
+  # Helm v4.0.0-v4.0.4 advertise --server-side=false but ignore it for the
+  # upgrade --install install-fallback path. Require v4.0.5+ before relying on
+  # the flag for Dynamo's webhook Secret conflict mitigation.
+  version="$(helm version --short 2>/dev/null | head -n 1 || true)"
+  version="${version#v}"
+  version="${version%%+*}"
+  version="${version%%-*}"
+  if ! [[ "${version}" =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then
+    return 1
+  fi
+  major="${BASH_REMATCH[1]}"
+  minor="${BASH_REMATCH[2]}"
+  patch="${BASH_REMATCH[3]}"
+  if (( major < 4 )); then
+    return 1
+  fi
+  if (( major == 4 )) && (( minor == 0 )) && (( patch < 5 )); then
+    return 1
+  fi
+  # Version gate passed; the final status is whether help lists --server-side.
   helm help upgrade 2>/dev/null | grep -q -- '--server-side'
 }
 
@@ -91,7 +116,7 @@ function retry() {
 function cleanup_helm_hooks() {
   local namespace="$1"
   local job_names
-  job_names=$(kubectl get jobs -n "${namespace}" \
+  job_names=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" \
     --field-selector=status.successful=0 \
     -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \
     2>/dev/null || true)
@@ -102,7 +127,7 @@ function cleanup_helm_hooks() {
     [[ -z "${name}" ]] && continue
     # Get the full Job JSON to reliably check annotations and status
     local job_json
-    job_json=$(kubectl get job "${name}" -n "${namespace}" -o json 2>/dev/null || true)
+    job_json=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get job "${name}" -n "${namespace}" -o json 2>/dev/null || true)
     [[ -z "${job_json}" ]] && continue
     # Skip non-hook Jobs (no helm.sh/hook annotation)
     local hook_val
@@ -111,13 +136,13 @@ function cleanup_helm_hooks() {
     # Capture diagnostics before deleting. This helps diagnose transient hook
     # failures (e.g., dynamo ssh-keygen) that are otherwise lost after cleanup.
     echo "  --- Failed hook Job ${name} diagnostics ---"
-    kubectl describe job "${name}" -n "${namespace}" 2>/dev/null | tail -50 || true
+    kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe job "${name}" -n "${namespace}" 2>/dev/null | tail -50 || true
     local pod_names
-    pod_names=$(kubectl get pods -n "${namespace}" -l "job-name=${name}" \
+    pod_names=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -l "job-name=${name}" \
       -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)
     for pod_name in ${pod_names}; do
       echo "  --- Hook pod ${pod_name} describe ---"
-      kubectl describe pod "${pod_name}" -n "${namespace}" 2>/dev/null | tail -50 || true
+      kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pod "${pod_name}" -n "${namespace}" 2>/dev/null | tail -50 || true
     done
     echo "  --- End diagnostics for ${name} ---"
     # Delete any non-succeeded hook Job. This function only runs after a Helm
@@ -125,7 +150,7 @@ function cleanup_helm_hooks() {
     # retry — whether it failed, is stuck Pending (timed out before the pod
     # started), or is still active with a stuck container.
     echo "  Cleaning up stale Helm hook Job ${name} in ${namespace}..."
-    kubectl delete job "${name}" -n "${namespace}" --ignore-not-found 2>/dev/null || true
+    kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" delete job "${name}" -n "${namespace}" --ignore-not-found 2>/dev/null || true
   done <<< "${job_names}"
 }
 
@@ -137,15 +162,15 @@ function dump_kai_scheduler_helm_diagnostics() {
 
   echo "  --- ${namespace} diagnostics ---"
   echo "  Jobs:"
-  kubectl get jobs -n "${namespace}" 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" 2>/dev/null || true
   echo "  Job descriptions:"
-  kubectl describe jobs -n "${namespace}" 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe jobs -n "${namespace}" 2>/dev/null || true
   echo "  Pods:"
-  kubectl get pods -n "${namespace}" -o wide 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -o wide 2>/dev/null || true
   echo "  Pod descriptions:"
-  kubectl describe pods -n "${namespace}" 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}" 2>/dev/null || true
   echo "  Recent events:"
-  kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
   echo "  --- End ${namespace} diagnostics ---"
 }
 
@@ -419,11 +444,12 @@ fi
 COMPONENT_HELM_TIMEOUT="20m"
 # Grove owns the generated webhook certificate Secret data after install.
 # Client-side apply avoids server-side field ownership conflicts during retries.
-# This flag requires a Helm client that supports --server-side=false.
-if helm_supports_server_side_flag; then
+# This flag requires Helm v4.0.5+; earlier Helm v4 releases advertise the flag
+# but ignore --server-side=false on a fresh upgrade --install fallback.
+if helm_supports_server_side_false_install; then
   COMPONENT_HELM_APPLY_ARGS=(--server-side=false)
 else
-  echo "::warning::dynamo-platform conflict mitigation requires a Helm client that supports --server-side=false; proceeding without this flag"
+  echo "::warning::dynamo-platform conflict mitigation requires Helm v4.0.5+ with working --server-side=false install fallback; proceeding without this flag"
 fi
 if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then
   COMPONENT_MAX_RETRIES="3"

From e1e931aba92ff0178f3df2effc1b50e19f8516a0 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sun, 26 Apr 2026 19:48:05 -0700
Subject: [PATCH 20/21] ci: harden H100 kind runtime workflows

---
 .github/actions/aicr-build/build-cli.sh       |   5 -
 .../aicr-build/build-validator-images.sh      |   3 +
 .../check-control-plane-health/action.yml     |  21 ++
 .../check-control-plane-health.sh             | 175 +++++++++++++-
 .../configure-nvidia-container-toolkit.sh     |  11 +-
 .../create-gpu-kind-cluster.sh                |  80 ++++---
 .../delete-stale-kind-cluster.sh              |  19 +-
 .../actions/gpu-debug-diagnostics/action.yml  |  35 +++
 .../wait-gpu-operands-bundle.sh               |  18 +-
 .../wait-gpu-operands-helm.sh                 |   2 +-
 .../actions/gpu-smoke-nvidia-smi/action.yml   |  36 +++
 .github/actions/gpu-test-cleanup/action.yml   |  10 +-
 .../collect-debug-artifacts.sh                |  39 ++++
 .../install-karpenter-kwok.sh                 |   2 +-
 .github/scripts/gpu-chainsaw-health.sh        |  50 +++-
 .github/scripts/gpu-debug-diagnostics.sh      | 212 ++++++++++++-----
 .../scripts/gpu-runtime-component-health.sh   | 110 +++++++++
 .github/scripts/gpu-smoke-run-nvidia-smi.sh   |  20 +-
 .../scripts/gpu-smoke-show-nvidia-smi-log.sh  |  16 +-
 .../workflows/gpu-h100-inference-test.yaml    | 203 ++--------------
 .../workflows/gpu-h100-kind-runtime-test.yaml | 221 ++++++++++++++++++
 .github/workflows/gpu-h100-training-test.yaml | 197 ++--------------
 .github/workflows/gpu-smoke-test.yaml         |  26 ++-
 pkg/bundler/deployer/helm/helm_test.go        |   4 +
 .../deployer/helm/templates/deploy.sh.tmpl    |   3 +-
 recipes/overlays/kind.yaml                    |  33 +++
 26 files changed, 1060 insertions(+), 491 deletions(-)
 create mode 100644 .github/actions/gpu-debug-diagnostics/action.yml
 create mode 100644 .github/actions/gpu-smoke-nvidia-smi/action.yml
 create mode 100644 .github/scripts/gpu-runtime-component-health.sh
 create mode 100644 .github/workflows/gpu-h100-kind-runtime-test.yaml

diff --git a/.github/actions/aicr-build/build-cli.sh b/.github/actions/aicr-build/build-cli.sh
index 83c834aad..c87428241 100644
--- a/.github/actions/aicr-build/build-cli.sh
+++ b/.github/actions/aicr-build/build-cli.sh
@@ -16,9 +16,4 @@
 set -euo pipefail
 
 mkdir -p dist
-if [[ -x dist/aicr ]]; then
-  echo "Reusing existing dist/aicr"
-  exit 0
-fi
-
 CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
diff --git a/.github/actions/aicr-build/build-validator-images.sh b/.github/actions/aicr-build/build-validator-images.sh
index 76682af90..f098e84e8 100644
--- a/.github/actions/aicr-build/build-validator-images.sh
+++ b/.github/actions/aicr-build/build-validator-images.sh
@@ -15,6 +15,7 @@
 
 set -euo pipefail
 
+VALIDATOR_PHASES="${VALIDATOR_PHASES:-}"
 if [[ -n "${VALIDATOR_PHASES}" ]]; then
   if [[ "${VALIDATOR_PHASES}" == "none" ]]; then
     echo "Skipping validator builds (validator_phases=none)"
@@ -26,6 +27,8 @@ else
   PHASES="deployment,performance,conformance"
 fi
 
+: "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+
 mkdir -p dist/validator
 for phase in ${PHASES//,/ }; do
   if ! [[ "${phase}" =~ ^[a-z][a-z0-9_-]*$ ]]; then
diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
index 0172b00a9..85833925f 100644
--- a/.github/actions/check-control-plane-health/action.yml
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -39,6 +39,22 @@ inputs:
     description: 'Optional duration to watch for new control-plane restarts after pods are Ready'
     required: false
     default: '0s'
+  stability_probe_interval:
+    description: 'Interval for active API server probes during the stability window'
+    required: false
+    default: '10s'
+  stability_probe_failure_threshold:
+    description: 'Consecutive active stability probe failures allowed before failing'
+    required: false
+    default: '2'
+  lease_components:
+    description: 'Space-separated leader election lease names to check for freshness'
+    required: false
+    default: kube-controller-manager kube-scheduler
+  lease_stale_timeout:
+    description: 'Maximum allowed leader election lease age at the end of a stability window'
+    required: false
+    default: '120s'
   recover_unhealthy:
     description: 'Restart eligible Kind control-plane static pod containers when they are currently unhealthy'
     required: false
@@ -62,7 +78,12 @@ runs:
         NAMESPACE: ${{ inputs.namespace }}
         COMPONENTS: ${{ inputs.components }}
         WAIT_TIMEOUT: ${{ inputs.wait_timeout }}
+        MAX_RESTARTS: ${{ inputs.max_restarts }}
         STABILITY_WINDOW: ${{ inputs.stability_window }}
+        STABILITY_PROBE_INTERVAL: ${{ inputs.stability_probe_interval }}
+        STABILITY_PROBE_FAILURE_THRESHOLD: ${{ inputs.stability_probe_failure_threshold }}
+        LEASE_COMPONENTS: ${{ inputs.lease_components }}
+        LEASE_STALE_TIMEOUT: ${{ inputs.lease_stale_timeout }}
         RECOVER_UNHEALTHY: ${{ inputs.recover_unhealthy }}
         RECOVERY_COMPONENTS: ${{ inputs.recovery_components }}
         MAX_RECOVERY_ATTEMPTS: ${{ inputs.max_recovery_attempts }}
diff --git a/.github/actions/check-control-plane-health/check-control-plane-health.sh b/.github/actions/check-control-plane-health/check-control-plane-health.sh
index 3614df47f..350538255 100644
--- a/.github/actions/check-control-plane-health/check-control-plane-health.sh
+++ b/.github/actions/check-control-plane-health/check-control-plane-health.sh
@@ -25,6 +25,25 @@ validate_duration_input() {
   fi
 }
 
+# Convert a <digits><s|m|h> duration ($1) into whole seconds, printed on stdout.
+duration_seconds() {
+  local duration="$1"
+  local suffix="${duration: -1}" factor
+  # Force base 10 so zero-padded values such as "08s" are not read as octal.
+  local count=$((10#${duration%[smh]}))
+
+  case "${suffix}" in
+    s) factor=1 ;;
+    m) factor=60 ;;
+    h) factor=3600 ;;
+    *)
+      echo "::error::unsupported duration unit in '${duration}'" >&2
+      exit 1
+      ;;
+  esac
+  echo $((count * factor))
+}
+
 MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}"
 MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}"
 if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then
@@ -32,6 +51,13 @@ if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then
   exit 1
 fi
 
+MAX_RESTARTS="${MAX_RESTARTS:-}"
+MAX_RESTARTS="${MAX_RESTARTS#"${MAX_RESTARTS%%[![:space:]]*}"}"
+MAX_RESTARTS="${MAX_RESTARTS%"${MAX_RESTARTS##*[![:space:]]}"}"
+if [[ -n "${MAX_RESTARTS}" ]] && [[ "${MAX_RESTARTS}" != "1" ]]; then
+  echo "::warning::max_restarts is deprecated and ignored; use stability_window to fail on new control-plane restarts"
+fi
+
 WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}"
 WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}"
 validate_duration_input wait_timeout "${WAIT_TIMEOUT}"
@@ -45,6 +71,42 @@ validate_duration_input stability_window "${STABILITY_WINDOW}"
 if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then
   STABILITY_WINDOW="0s"
 fi
+STABILITY_WINDOW_SECONDS="$(duration_seconds "${STABILITY_WINDOW}")"
+
+STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL:-10s}"
+STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL#"${STABILITY_PROBE_INTERVAL%%[![:space:]]*}"}"
+STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL%"${STABILITY_PROBE_INTERVAL##*[![:space:]]}"}"
+validate_duration_input stability_probe_interval "${STABILITY_PROBE_INTERVAL}"
+STABILITY_PROBE_INTERVAL_SECONDS="$(duration_seconds "${STABILITY_PROBE_INTERVAL}")"
+if (( STABILITY_PROBE_INTERVAL_SECONDS <= 0 )); then
+  echo "::error::stability_probe_interval must be greater than 0, got '${STABILITY_PROBE_INTERVAL}'"
+  exit 1
+fi
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD:-2}"
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD#"${STABILITY_PROBE_FAILURE_THRESHOLD%%[![:space:]]*}"}"
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD%"${STABILITY_PROBE_FAILURE_THRESHOLD##*[![:space:]]}"}"
+if ! [[ "${STABILITY_PROBE_FAILURE_THRESHOLD}" =~ ^[0-9]+$ ]]; then
+  echo "::error::stability_probe_failure_threshold must be a positive integer, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'"
+  exit 1
+fi
+if (( STABILITY_PROBE_FAILURE_THRESHOLD <= 0 )); then
+  echo "::error::stability_probe_failure_threshold must be greater than 0, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'"
+  exit 1
+fi
+
+LEASE_COMPONENTS="${LEASE_COMPONENTS:-kube-controller-manager kube-scheduler}"
+LEASE_COMPONENTS="${LEASE_COMPONENTS#"${LEASE_COMPONENTS%%[![:space:]]*}"}"
+LEASE_COMPONENTS="${LEASE_COMPONENTS%"${LEASE_COMPONENTS##*[![:space:]]}"}"
+
+LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT:-120s}"
+LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT#"${LEASE_STALE_TIMEOUT%%[![:space:]]*}"}"
+LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT%"${LEASE_STALE_TIMEOUT##*[![:space:]]}"}"
+validate_duration_input lease_stale_timeout "${LEASE_STALE_TIMEOUT}"
+LEASE_STALE_TIMEOUT_SECONDS="$(duration_seconds "${LEASE_STALE_TIMEOUT}")"
+if (( LEASE_STALE_TIMEOUT_SECONDS <= 0 )); then
+  echo "::error::lease_stale_timeout must be greater than 0, got '${LEASE_STALE_TIMEOUT}'"
+  exit 1
+fi
 
 RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}"
 RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}"
@@ -147,6 +209,114 @@ require_readyz() {
   fi
 }
 
+# Probe kube-apiserver /readyz and read each lease in LEASE_COMPONENTS once.
+# $1 is a context phrase appended to error annotations; returns 1 on failure.
+probe_control_plane_api() {
+  local context="$1" lease_name lease_line
+
+  kubectl_kind get --raw='/readyz' >/dev/null || {
+    echo "::error::kube-apiserver /readyz probe failed ${context}"
+    return 1
+  }
+
+  for lease_name in ${LEASE_COMPONENTS}; do
+    lease_line=$(kubectl_kind -n "${NAMESPACE}" get lease "${lease_name}" \
+      -o jsonpath='{.metadata.name}{" holder="}{.spec.holderIdentity}{" renewTime="}{.spec.renewTime}{"\n"}' 2>/dev/null) || {
+      echo "::error::failed to read leader election lease ${lease_name} ${context}"
+      return 1
+    }
+    echo "${lease_line}"
+  done
+}
+
+# Print the Unix epoch seconds for the timestamp in $1 using `date -u -d`;
+# parse errors are silenced and surface only as a non-zero exit status.
+lease_renew_epoch() {
+  date -u -d "$1" +%s 2>/dev/null
+}
+
+# Exit 1 (after dumping runtime diagnostics) when any LEASE_COMPONENTS lease was
+# renewed more than LEASE_STALE_TIMEOUT_SECONDS ago; no-op when the list is empty.
+verify_leader_lease_freshness() {
+  local component now_epoch renew_time renew_epoch lease_age
+
+  [[ -z "${LEASE_COMPONENTS}" ]] && return
+
+  echo "Checking leader election lease freshness (max age ${LEASE_STALE_TIMEOUT})..."
+  for component in ${LEASE_COMPONENTS}; do
+    if ! renew_time=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o jsonpath='{.spec.renewTime}' 2>/dev/null); then
+      echo "::error::failed to read leader election lease ${component}"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+    # Sample the clock after the read so a slow API call cannot hide staleness.
+    now_epoch="$(date -u +%s)"
+    if [[ -z "${renew_time}" ]]; then
+      echo "::error::leader election lease ${component} has empty spec.renewTime"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+    if ! renew_epoch="$(lease_renew_epoch "${renew_time}")"; then
+      echo "::error::failed to parse leader election lease ${component} renewTime '${renew_time}'"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+    # Clamp negative ages (renewTime ahead of the local clock) to zero.
+    lease_age=$((now_epoch - renew_epoch))
+    if (( lease_age < 0 )); then
+      lease_age=0
+    fi
+    echo "${component} lease renewTime=${renew_time} age=${lease_age}s"
+    if (( lease_age > LEASE_STALE_TIMEOUT_SECONDS )); then
+      echo "::error::leader election lease ${component} is stale: age=${lease_age}s exceeds ${LEASE_STALE_TIMEOUT}"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+  done
+}
+
+# Watch the control plane for STABILITY_WINDOW_SECONDS of sleep time, probing
+# the API every STABILITY_PROBE_INTERVAL_SECONDS. $1 labels the window in logs.
+# Exits 1 after STABILITY_PROBE_FAILURE_THRESHOLD consecutive probe failures;
+# otherwise finishes by checking leader election lease freshness.
+observe_stability_window() {
+  local label="$1"
+  local waited=0 probe_count=0 nap failure_streak=0 failure_total=0
+
+  echo "Observing control-plane stability for ${STABILITY_WINDOW} (${label}); probing every ${STABILITY_PROBE_INTERVAL}, failing after ${STABILITY_PROBE_FAILURE_THRESHOLD} consecutive probe failure(s)..."
+  while (( waited < STABILITY_WINDOW_SECONDS )); do
+    # Sleep one interval, shortened so the last sleep never overshoots the window.
+    nap=$((STABILITY_WINDOW_SECONDS - waited))
+    if (( nap > STABILITY_PROBE_INTERVAL_SECONDS )); then
+      nap="${STABILITY_PROBE_INTERVAL_SECONDS}"
+    fi
+    if (( nap > 0 )); then
+      sleep "${nap}"
+      waited=$((waited + nap))
+    fi
+
+    probe_count=$((probe_count + 1))
+    echo "=== Control-plane stability probe ${probe_count} (${waited}/${STABILITY_WINDOW_SECONDS}s, ${label}) ==="
+    if probe_control_plane_api "during ${label} stability probe ${probe_count}"; then
+      failure_streak=0
+    else
+      failure_total=$((failure_total + 1))
+      failure_streak=$((failure_streak + 1))
+      echo "::warning::control-plane stability probe ${probe_count} failed (${failure_streak} consecutive, ${failure_total} total)"
+      if (( failure_streak >= STABILITY_PROBE_FAILURE_THRESHOLD )); then
+        echo "::error::control-plane had ${failure_streak} consecutive failed stability probes during ${label}"
+        dump_all_control_plane_runtime_diagnostics
+        exit 1
+      fi
+    fi
+  done
+
+  if (( failure_total > 0 )); then
+    echo "::warning::control-plane had ${failure_total} transient failed stability probe(s) during ${label}; final health checks must still pass"
+  fi
+  verify_leader_lease_freshness
+}
+
 dump_api_server_health() {
   local endpoint
 
@@ -391,8 +561,7 @@ verify_stability_window() {
     return
   fi
 
-  echo "Observing control-plane stability for ${STABILITY_WINDOW}..."
-  sleep "${STABILITY_WINDOW}"
+  observe_stability_window "primary"
   for component in ${COMPONENTS}; do
     initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
     if [[ -z "${initial_restarts}" ]]; then
@@ -427,7 +596,7 @@ verify_stability_window() {
   fi
 
   echo "::warning::control-plane recovery occurred; observing one additional ${STABILITY_WINDOW} stability window"
-  sleep "${STABILITY_WINDOW}"
+  observe_stability_window "post-recovery"
   for component in ${COMPONENTS}; do
     initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
     if [[ -z "${initial_restarts}" ]]; then
diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
index 50d8c10fa..16352077c 100644
--- a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
+++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
@@ -18,10 +18,19 @@ set -euo pipefail
 sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
 sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
 sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place
+set +e
 timeout 120s sudo systemctl restart docker
+restart_status=$?
+set -e
+if (( restart_status != 0 )); then
+  echo "::error::Docker restart failed after NVIDIA runtime configuration"
+  sudo systemctl status docker --no-pager || true
+  journalctl -u docker --since "10 minutes ago" --no-pager || true
+  exit "${restart_status}"
+fi
 
 for attempt in $(seq 1 30); do
-  if systemctl is-active --quiet docker && docker info >/dev/null 2>&1; then
+  if systemctl is-active --quiet docker && timeout 5s docker info >/dev/null 2>&1; then
     echo "Docker is healthy after NVIDIA runtime configuration."
     exit 0
   fi
diff --git a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
index 2f9ab4817..0c22fb845 100644
--- a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
+++ b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
@@ -25,6 +25,20 @@ validate_duration_input() {
   fi
 }
 
+kubectl_kind() {  # kubectl against this kind cluster's context: 10s request timeout, 30s hard process limit.
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+kubectl_kind_wait() {  # Long-running variant: 300s request timeout inside a 330s hard process limit.
+  timeout 330s kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+docker_timeout() {  # Usage: docker_timeout <limit> <docker args...>
+  local limit="$1"  # handed to timeout(1) as its duration, e.g. 30s
+  shift
+  timeout "${limit}" docker "$@"
+}
+
 validate_generated_control_plane_config() {
   if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
     for patch_file in "${patch_dir}"/*.yaml; do
@@ -288,22 +302,22 @@ case "${create_status}" in
     ;;
 esac
 
-kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s
-kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info
-kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide
-kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes | \
+kubectl_kind_wait wait --for=condition=Ready nodes --all --timeout=300s
+kubectl_kind cluster-info
+kubectl_kind get nodes -o wide
+kubectl_kind describe nodes | \
   grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:|  cpu|  memory|  nvidia.com/gpu)" || true
 
 echo "=== Kind node container resources ==="
-docker ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
   --format '{{.Names}}' | sort | while read -r node_container; do
     [[ -z "${node_container}" ]] && continue
-    docker inspect "${node_container}" \
+    docker_timeout 30s inspect "${node_container}" \
       --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}'
   done
 
 echo "=== Control-plane resource requests/limits ==="
-kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
+kubectl_kind -n kube-system \
   get pods -l tier=control-plane -o json | jq -r '
     .items[] as $pod |
     $pod.metadata.name,
@@ -327,7 +341,7 @@ control_plane_request() {
   local component="$1"
   local resource="$2"
 
-  kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
+  kubectl_kind -n kube-system \
     get pod -l "component=${component}" \
     -o "jsonpath={.items[0].spec.containers[0].resources.requests.${resource}}"
 }
@@ -353,7 +367,7 @@ assert_control_plane_request() {
 control_plane_command_args() {
   local component="$1"
 
-  kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kube-system \
+  kubectl_kind -n kube-system \
     get pod -l "component=${component}" \
     -o json | jq -r '.items[0].spec.containers[0] | ((.command // []) + (.args // []))[]?'
 }
@@ -363,7 +377,7 @@ static_pod_manifest_contains_arg() {
   local expected="$2"
   local node="${KIND_CLUSTER_NAME}-control-plane"
 
-  docker exec "${node}" grep -Fq -- "- ${expected}" "/etc/kubernetes/manifests/${component}.yaml"
+  docker_timeout 30s exec "${node}" grep -Fq -- "- ${expected}" "/etc/kubernetes/manifests/${component}.yaml"
 }
 
 running_static_pod_container_contains_arg() {
@@ -372,16 +386,18 @@ running_static_pod_container_contains_arg() {
   local node="${KIND_CLUSTER_NAME}-control-plane"
   local container_ids
   local container_id
+  local inspect_output
 
-  if ! container_ids="$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null)"; then
+  if ! container_ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null)"; then
     return 1
   fi
   [[ -z "${container_ids}" ]] && return 1
 
   for container_id in ${container_ids}; do
-    if docker exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -e --arg expected "${expected}" '
+    inspect_output="$(docker_timeout 30s exec "${node}" crictl inspect "${container_id}" 2>/dev/null || true)"
+    if jq -e --arg expected "${expected}" '
       ([.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?] | index($expected)) != null
-    ' >/dev/null; then
+    ' >/dev/null 2>&1 <<< "${inspect_output}" || grep -Fq -- "${expected}" <<< "${inspect_output}"; then
       return 0
     fi
   done
@@ -395,14 +411,14 @@ dump_running_static_pod_container_args() {
   local container_id
 
   echo "Running ${component} CRI container args:"
-  container_ids="$(docker exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)"
+  container_ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)"
   if [[ -z "${container_ids}" ]]; then
     echo "(no running ${component} CRI containers found)"
     return
   fi
   for container_id in ${container_ids}; do
     echo "--- ${container_id} ---"
-    docker exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -r '
+    docker_timeout 30s exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -r '
       [.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?][]?
     ' || true
   done
@@ -413,32 +429,40 @@ dump_static_pod_manifest() {
   local node="${KIND_CLUSTER_NAME}-control-plane"
 
   echo "Static pod manifest /etc/kubernetes/manifests/${component}.yaml:"
-  docker exec "${node}" sed -n '1,220p' "/etc/kubernetes/manifests/${component}.yaml" || true
+  docker_timeout 30s exec "${node}" sed -n '1,220p' "/etc/kubernetes/manifests/${component}.yaml" || true
 }
 
 assert_control_plane_arg() {
   local component="$1"
   local expected="$2"
+  local attempt
   local command_args
 
-  command_args="$(control_plane_command_args "${component}")"
-  if ! grep -Fxq -- "${expected}" <<< "${command_args}"; then
+  for attempt in $(seq 1 12); do
+    command_args="$(control_plane_command_args "${component}" || true)"
+    if grep -Fxq -- "${expected}" <<< "${command_args}"; then
+      echo "${component} command/args verified: ${expected}"
+      return
+    fi
     if running_static_pod_container_contains_arg "${component}" "${expected}"; then
       echo "${component} running CRI container args verified: ${expected} (live mirror pod omitted it)"
       return
     fi
     if static_pod_manifest_contains_arg "${component}" "${expected}"; then
-      echo "::warning::${component} live mirror pod and running CRI container args did not show ${expected}; static pod manifest is patched"
-      return
+      echo "::warning::${component} static pod manifest has ${expected}, but the running container does not yet; waiting for kubelet to converge (${attempt}/12)"
+      sleep 5
+      continue
     fi
-    echo "::error::${component} live pod command/args does not contain ${expected}"
-    echo "Observed live command/args:"
-    echo "${command_args}"
-    dump_running_static_pod_container_args "${component}"
-    dump_static_pod_manifest "${component}"
-    exit 1
-  fi
-  echo "${component} command/args verified: ${expected}"
+
+    break
+  done
+
+  echo "::error::${component} running command/args does not contain ${expected}"
+  echo "Observed live command/args:"
+  echo "${command_args:-}"
+  dump_running_static_pod_container_args "${component}"
+  dump_static_pod_manifest "${component}"
+  exit 1
 }
 
 if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
diff --git a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
index 0f29c469b..5e0a81778 100644
--- a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
+++ b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
@@ -19,6 +19,21 @@ docker_timeout() {
   timeout 30s docker "$@"
 }
 
+# Load the IDs of all containers labelled for this cluster into remaining_containers; exits 1 if Docker cannot be queried.
+read_kind_container_ids() {
+  local output
+  # Capture stdout only. With 2>&1, a Docker CLI warning on stderr would be parsed
+  # as a container ID and passed to `docker rm -f`; stderr now goes to the job log.
+  if ! output="$(docker_timeout ps -aq --filter "label=${kind_cluster_label}")"; then
+    echo "::error::failed to query stale kind containers for ${KIND_CLUSTER_NAME}"
+    exit 1
+  fi
+  remaining_containers=()
+  if [[ -n "${output}" ]]; then  # mapfile on an empty here-string would yield one empty element
+    mapfile -t remaining_containers <<< "${output}"
+  fi
+}
+
 if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then
   echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
   if ! timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then
@@ -28,14 +43,14 @@ else
   echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
 fi
 
-mapfile -t remaining_containers < <(docker_timeout ps -aq --filter "label=${kind_cluster_label}")
+read_kind_container_ids
 if (( ${#remaining_containers[@]} > 0 )); then
   echo "Removing stale containers for ${KIND_CLUSTER_NAME}:"
   docker_timeout ps -a --filter "label=${kind_cluster_label}"
   docker_timeout rm -f "${remaining_containers[@]}"
 fi
 
-mapfile -t remaining_containers < <(docker_timeout ps -aq --filter "label=${kind_cluster_label}")
+read_kind_container_ids
 if (( ${#remaining_containers[@]} > 0 )); then
   echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:"
   docker_timeout ps -a --filter "label=${kind_cluster_label}"
diff --git a/.github/actions/gpu-debug-diagnostics/action.yml b/.github/actions/gpu-debug-diagnostics/action.yml
new file mode 100644
index 000000000..e5a38b964
--- /dev/null
+++ b/.github/actions/gpu-debug-diagnostics/action.yml
@@ -0,0 +1,35 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'GPU Debug Diagnostics'
+description: 'Print bounded GPU CI diagnostics while the kind cluster is still present.'
+
+inputs:
+  cluster_name:
+    description: 'Kind cluster name'
+    required: true
+  mode:
+    description: 'Diagnostic mode: smoke, training, or inference'
+    required: false
+    default: 'smoke'
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Print GPU debug diagnostics
+      shell: bash
+      env:  # inputs reach the script only as environment variables; no arguments are passed
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+        GPU_TEST_DIAGNOSTIC_MODE: ${{ inputs.mode }}
+      run: bash "${{ github.workspace }}/.github/scripts/gpu-debug-diagnostics.sh"  # script is read from the workspace, not from this action's directory
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
index 2fee8a2c0..9566fb8ba 100644
--- a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
+++ b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
@@ -19,16 +19,28 @@ echo "Waiting for GPU operator controller to deploy operands..."
 # The GPU operator controller watches ClusterPolicy and creates
 # DaemonSets for device-plugin, NFD, GFD, etc. This happens
 # asynchronously after the helm install completes.
+daemonset_found=false
 for i in $(seq 1 30); do
-  count=$(kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-    get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l)
-  if [[ "$count" -gt 0 ]]; then
+  daemonsets=""
+  if daemonsets=$(kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+    get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null); then
+    if [[ -n "${daemonsets}" ]]; then
+      daemonset_found=true
+    fi
+  fi
+  if [[ "${daemonset_found}" == "true" ]]; then
     echo "Device plugin DaemonSet found."
     break
   fi
   echo "Waiting for device plugin DaemonSet to be created... (${i}/30)"
   sleep 10
 done
+if [[ "${daemonset_found}" != "true" ]]; then
+  echo "::error::device plugin DaemonSet was not created within 300s"
+  kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods || true
+  kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' || true
+  exit 1
+fi
 echo "Waiting for device plugin rollout..."
 # Operands are excluded from control-plane nodes via nodeAffinity in
 # the kind overlay, so all scheduled pods should become ready.
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
index 2f0bbe159..3d3042f8a 100644
--- a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
+++ b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
@@ -32,6 +32,6 @@ for i in $(seq 1 30); do
 done
 
 kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-  rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true
+  rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
 echo "GPU Operator pods:"
 kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
diff --git a/.github/actions/gpu-smoke-nvidia-smi/action.yml b/.github/actions/gpu-smoke-nvidia-smi/action.yml
new file mode 100644
index 000000000..cb61b5d0d
--- /dev/null
+++ b/.github/actions/gpu-smoke-nvidia-smi/action.yml
@@ -0,0 +1,36 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'GPU Smoke nvidia-smi'
+description: 'Run nvidia-smi in a GPU-backed kind pod and print its logs.'
+
+inputs:
+  cluster_name:
+    description: 'Kind cluster name'
+    required: true
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Run nvidia-smi in a pod
+      shell: bash
+      env:  # the cluster name reaches both scripts only through this variable
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-run-nvidia-smi.sh"  # script is read from the workspace, not from this action's directory
+    - name: Show nvidia-smi output
+      if: always()  # still runs when the nvidia-smi step above failed
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh"
diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml
index a1eef57b9..417130669 100644
--- a/.github/actions/gpu-test-cleanup/action.yml
+++ b/.github/actions/gpu-test-cleanup/action.yml
@@ -23,18 +23,22 @@ inputs:
     description: 'Prefix for the uploaded artifact name'
     required: false
     default: 'gpu-test-debug'
+  collect_artifacts:
+    description: 'Collect and upload debug artifacts before deleting the kind cluster'
+    required: false
+    default: 'false'
 
 runs:
   using: 'composite'
   steps:
     - name: Collect debug artifacts
-      if: failure() || cancelled()
+      if: inputs.collect_artifacts == 'true'
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
       run: bash "${{ github.action_path }}/collect-debug-artifacts.sh"
     - name: Export kind logs
-      if: failure() || cancelled()
+      if: always() && inputs.collect_artifacts == 'true'
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
@@ -46,7 +50,7 @@ runs:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
       run: bash "${{ github.action_path }}/cleanup-kind-cluster.sh"
     - name: Upload debug artifacts
-      if: failure() || cancelled()
+      if: always() && inputs.collect_artifacts == 'true'
       uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f  # v6.0.0
       with:
         name: ${{ inputs.artifact_name_prefix }}-${{ github.run_id }}
diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
index 257695334..cb33e770a 100644
--- a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
+++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
@@ -16,6 +16,7 @@
 # Diagnostic artifact collection intentionally omits -e so one broken cluster
 # call does not prevent later artifacts from being collected.
 set -uo pipefail
+rm -rf /tmp/debug-artifacts
 mkdir -p /tmp/debug-artifacts
 CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd"
 kubectl_kind() {
@@ -27,6 +28,23 @@ docker_timeout() {
   timeout "${limit}" docker "$@"
 }
 
+{  # Runner baseline: each probe is best-effort and all output lands in one file.
+  date -u || true
+  hostname || true
+  uptime || true
+  nproc || true
+  free -h || true
+  df -h / || true
+  df -ih / || true
+} > /tmp/debug-artifacts/runner-baseline.txt 2>&1 || true
+docker_timeout 30s version > /tmp/debug-artifacts/docker-version.txt 2>&1 || true
+docker_timeout 30s info > /tmp/debug-artifacts/docker-info.txt 2>&1 || true
+timeout 30s nvidia-smi -L > /tmp/debug-artifacts/host-gpus.txt 2>&1 || true  # bounded like the docker calls so a hung probe cannot stall artifact collection
+timeout 30s nvidia-smi >> /tmp/debug-artifacts/host-gpus.txt 2>&1 || true  # appends the full report after the -L listing
+timeout 30s kind get clusters > /tmp/debug-artifacts/kind-clusters.txt 2>&1 || true
+docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+  > /tmp/debug-artifacts/kind-node-containers.txt 2>&1 || true
+
 kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true
 kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true
 kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true
@@ -48,6 +66,24 @@ done
 kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true
 kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
 kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
+kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide \
+  > /tmp/debug-artifacts/monitoring-workloads.txt 2>&1 || true
+kubectl_kind -n monitoring describe deployment kube-prometheus-operator \
+  > /tmp/debug-artifacts/kube-prometheus-operator-deployment-describe.txt 2>&1 || true
+kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=300 \
+  > /tmp/debug-artifacts/kube-prometheus-operator-logs.txt 2>&1 || true
+kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=300 \
+  > /tmp/debug-artifacts/kube-prometheus-operator-previous-logs.txt 2>&1 || true
+kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' \
+  > /tmp/debug-artifacts/monitoring-events.txt 2>&1 || true
+{
+  kubectl_kind -n monitoring get pods -o name 2>/dev/null \
+    | grep '^pod/kube-prometheus-operator-' \
+    | while read -r pod; do
+        echo "=== ${pod} ==="
+        kubectl_kind -n monitoring describe "${pod}" 2>&1 || true
+      done
+} > /tmp/debug-artifacts/kube-prometheus-operator-pods-describe.txt 2>&1 || true
 kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true
 tar_inputs=()
 [[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml)
@@ -63,6 +99,8 @@ docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}
   --format '{{.Names}}' | sort | while read -r node_container; do
     [[ -z "${node_container}" ]] && continue
     node_file="${node_container//[^A-Za-z0-9_.-]/_}"
+    docker_timeout 30s inspect "${node_container}" \
+      > "/tmp/debug-artifacts/${node_file}-docker-inspect.json" 2>&1 || true
     docker_timeout 30s exec "${node_container}" journalctl -u kubelet \
       --since "90 minutes ago" --no-pager \
       > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true
@@ -83,6 +121,7 @@ docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}
       echo "--- top cpu/memory processes ---"
       ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
     ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true
+    # shellcheck disable=SC2016 # Expanded inside the kind node shell.
     docker_timeout 120s exec "${node_container}" sh -c '
       for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do
         echo "=== ${component} static pod manifest ==="
diff --git a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
index 472eb844b..8987144ab 100644
--- a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
+++ b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
@@ -32,7 +32,7 @@ validate_seconds_input() {
     echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'"
     exit 1
   fi
-  if (( input_value <= 0 )); then
+  if (( 10#${input_value} <= 0 )); then
     echo "::error::${input_name} must be greater than 0 seconds, got '${input_value}'"
     exit 1
   fi
diff --git a/.github/scripts/gpu-chainsaw-health.sh b/.github/scripts/gpu-chainsaw-health.sh
index 7e2c5ac96..7098c6bf8 100644
--- a/.github/scripts/gpu-chainsaw-health.sh
+++ b/.github/scripts/gpu-chainsaw-health.sh
@@ -26,11 +26,53 @@ if [[ ! -d "${test_dir}" ]]; then
 fi
 
 CHAINSAW_TEST_TIMEOUT="${CHAINSAW_TEST_TIMEOUT:-30m}"
-CHAINSAW_CLEANUP_TIMEOUT="${CHAINSAW_CLEANUP_TIMEOUT:-120s}"
-CHAINSAW_DELETE_TIMEOUT="${CHAINSAW_DELETE_TIMEOUT:-120s}"
+MONITORING_READY_TIMEOUT="${MONITORING_READY_TIMEOUT:-180s}"
+
+kubectl_kind() {  # kubectl against this kind cluster's context: 10s request timeout, 30s hard process limit.
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+kubectl_kind_wait() {  # No timeout(1) wrapper here: callers must pass kubectl's own --timeout.
+  # Rollout status opens a watch that is already bounded by --timeout. Keep
+  # request-timeout unset here so a slow API server does not cut the watch short.
+  kubectl --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+print_monitoring_diagnostics() {  # Best-effort snapshot of the monitoring namespace and kube-prometheus-operator; failures are ignored.
+  echo "=== Monitoring workloads ==="
+  kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment ==="
+  kubectl_kind -n monitoring get deployment kube-prometheus-operator -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment describe ==="
+  kubectl_kind -n monitoring describe deployment kube-prometheus-operator 2>/dev/null || true
+  echo "=== kube-prometheus-operator pods ==="  # the grep keeps the header row plus pods whose name starts with the operator prefix
+  kubectl_kind -n monitoring get pods -o wide 2>/dev/null \
+    | grep -E '(^NAME|^kube-prometheus-operator-)' || true
+  echo "=== kube-prometheus-operator logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=200 2>/dev/null || true
+  echo "=== kube-prometheus-operator previous logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=200 2>/dev/null || true
+  echo "=== Recent events (monitoring) ==="  # only the last 100 lines of the time-sorted events are shown
+  kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' 2>/dev/null | tail -100 || true
+}
+
+wait_for_monitoring_operator() {  # Returns 0 once the operator deployment is rolled out, 1 if it is not within MONITORING_READY_TIMEOUT.
+  echo "Waiting for monitoring/kube-prometheus-operator before Chainsaw..."
+  print_monitoring_diagnostics  # baseline snapshot; printed even when the rollout then succeeds
+  if kubectl_kind_wait -n monitoring rollout status deployment/kube-prometheus-operator \
+    --timeout="${MONITORING_READY_TIMEOUT}"; then
+    echo "monitoring/kube-prometheus-operator is rolled out."
+    return 0
+  fi
+
+  echo "::error::monitoring/kube-prometheus-operator did not become available within ${MONITORING_READY_TIMEOUT}"
+  print_monitoring_diagnostics  # second snapshot shows the state at the moment of failure
+  return 1
+}
+
+wait_for_monitoring_operator
 
 timeout "${CHAINSAW_TEST_TIMEOUT}" chainsaw test \
   --test-dir "${test_dir}" \
   --config tests/chainsaw/chainsaw-config.yaml \
-  --cleanup-timeout "${CHAINSAW_CLEANUP_TIMEOUT}" \
-  --delete-timeout "${CHAINSAW_DELETE_TIMEOUT}"
+  --skip-delete
diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh
index 3721691b3..3db82a6e8 100644
--- a/.github/scripts/gpu-debug-diagnostics.sh
+++ b/.github/scripts/gpu-debug-diagnostics.sh
@@ -24,6 +24,39 @@ kubectl_kind() {
   timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
 }
 
+docker_timeout() {  # Usage: docker_timeout <limit> <docker args...>
+  local limit="$1"  # handed to timeout(1) as its duration, e.g. 30s
+  shift
+  timeout "${limit}" docker "$@"
+}
+
+print_setup_diagnostics() {  # Best-effort dump of runner, Docker, GPU, and kind node state; probe failures are ignored.
+  echo "=== Runner baseline ==="
+  date -u || true
+  hostname || true
+  uptime || true
+  nproc || true
+  free -h || true
+  df -h / || true
+  df -ih / || true
+  echo "=== Docker health ==="  # external probes below are time-bounded so a hung daemon or driver cannot stall diagnostics
+  docker_timeout 30s info >/dev/null 2>&1 && docker_timeout 30s version || true  # version is printed only when info succeeds
+  echo "=== Host GPUs ==="
+  timeout 30s nvidia-smi -L || true
+  timeout 30s nvidia-smi || true
+  echo "=== Kind clusters ==="
+  timeout 30s kind get clusters || true
+  echo "=== Kind node containers ==="
+  docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
+  echo "=== Kind node container resources ==="  # one inspect line per running node container, sorted by name
+  docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+    --format '{{.Names}}' | sort | while read -r node_container; do
+      [[ -z "${node_container}" ]] && continue
+      docker_timeout 30s inspect "${node_container}" \
+        --format '{{.Name}} State={{.State.Status}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' || true
+    done || true
+}
+
 print_workload_images() {
   local ns="$1"
   kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \
@@ -47,15 +80,39 @@ print_workload_inventory() {
   done
 }
 
-print_grafana_diagnostics() {
-  echo "=== Grafana deployment ==="
-  kubectl_kind -n monitoring get deployment grafana -o wide 2>/dev/null || true
-  echo "=== Grafana pods ==="
-  kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true
-  echo "=== Grafana deployment describe ==="
-  kubectl_kind -n monitoring describe deployment grafana 2>/dev/null || true
-  echo "=== Grafana pod describe ==="
-  kubectl_kind -n monitoring describe pods -l app.kubernetes.io/name=grafana 2>/dev/null || true
+print_component_status_summary() {  # Cluster-wide overview of workloads, rollout counters, and pods outside Running/Succeeded.
+  echo "=== Component workload status ==="
+  kubectl_kind get deployments,statefulsets,daemonsets,pods -A -o wide 2>/dev/null || true
+  echo "=== Component rollout conditions ==="  # NOTE(review): DaemonSet status has no *Replicas fields, so those columns print <none> for DaemonSets
+  kubectl_kind get deployments,statefulsets,daemonsets -A \
+    -o custom-columns='KIND:.kind,NAMESPACE:.metadata.namespace,NAME:.metadata.name,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas,DESIRED:.status.replicas,UPDATED:.status.updatedReplicas,AGE:.metadata.creationTimestamp' \
+    2>/dev/null || true
+  echo "=== Non-ready pods ==="  # selection is by phase only: pods that are Running but not Ready are not listed
+  kubectl_kind get pods -A \
+    --field-selector=status.phase!=Running,status.phase!=Succeeded \
+    -o wide 2>/dev/null || true
+}
+
+print_kube_prometheus_operator_diagnostics() {  # Best-effort snapshot of the monitoring namespace and kube-prometheus-operator; failures are ignored.
+  echo "=== Monitoring workloads ==="
+  kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment ==="
+  kubectl_kind -n monitoring get deployment kube-prometheus-operator -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment describe ==="
+  kubectl_kind -n monitoring describe deployment kube-prometheus-operator 2>/dev/null || true
+  echo "=== kube-prometheus-operator pod describe ==="  # pods are picked by name prefix, not by label selector
+  kubectl_kind -n monitoring get pods -o name 2>/dev/null \
+    | grep '^pod/kube-prometheus-operator-' \
+    | while read -r pod; do
+        echo "--- ${pod} ---"
+        kubectl_kind -n monitoring describe "${pod}" 2>/dev/null || true
+      done || true
+  echo "=== kube-prometheus-operator logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=200 2>/dev/null || true
+  echo "=== kube-prometheus-operator previous logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=200 2>/dev/null || true
+  echo "=== Recent events (monitoring) ==="  # only the last 80 lines of the time-sorted events are shown
+  kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' 2>/dev/null | tail -80 || true
 }
 
 print_kai_diagnostics() {
@@ -83,6 +140,34 @@ print_kai_diagnostics() {
   kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
 }
 
+print_custom_metrics() {  # Args: namespaces to query. Prints three GPU metrics per namespace from the custom metrics API.
+  local metric
+  local ns
+  local namespaces=("$@")
+
+  echo "=== Custom metrics API ==="
+  for metric in gpu_utilization gpu_memory_used gpu_power_usage; do
+    for ns in "${namespaces[@]}"; do
+      echo "--- ${ns}/${metric} ---"  # the raw query below asks for this metric across all pods (*) in the namespace
+      kubectl_kind get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${ns}/pods/*/${metric}" 2>/dev/null \
+        | jq . || true
+    done
+  done
+}
+
+print_metrics_pipeline_diagnostics() {  # Best-effort listing of adapter, DCGM exporter, and monitoring pods, plus ResourceSlices and nodes.
+  echo "=== prometheus-adapter pods ==="
+  kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
+  echo "=== DCGM Exporter pods ==="
+  kubectl_kind -n gpu-operator get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
+  echo "=== Monitoring pods ==="
+  kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true
+  echo "=== DRA ResourceSlices ==="
+  kubectl_kind get resourceslices -o wide 2>/dev/null || true
+  echo "=== Node status ==="
+  kubectl_kind get nodes -o wide 2>/dev/null || true
+}
+
 print_common_gpu_diagnostics() {
   echo "=== ClusterPolicy status ==="
   kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true
@@ -94,64 +179,75 @@ print_common_gpu_diagnostics() {
   kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
 }
 
+# Shared failure dump for the training and inference modes. Arguments are the
+# mode-specific namespaces to add to the workload inventory and metric queries.
+print_h100_common_diagnostics() {
+  local metric_namespaces=("$@")
+  local common_namespaces=(
+    cert-manager gpu-operator monitoring skyhook
+    nvsentinel nvidia-dra-driver nvidia-network-operator kai-scheduler
+  )
+
+  print_setup_diagnostics
+  print_component_status_summary
+  print_workload_inventory "${common_namespaces[@]}" "${metric_namespaces[@]}"
+  print_common_gpu_diagnostics
+  print_kube_prometheus_operator_diagnostics
+  print_kai_diagnostics
+  # gpu-operator is always queried in addition to the caller-supplied namespaces.
+  print_custom_metrics gpu-operator "${metric_namespaces[@]}"
+  print_metrics_pipeline_diagnostics
+  echo "=== Node resources ==="
+  kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true
+}
+
+# Show the Kubeflow Trainer deployment, pods, validating webhook, and TrainJob CRD.
+print_kubeflow_diagnostics() {
+  echo "=== Kubeflow Trainer deployment ==="
+  kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
+  echo "=== Kubeflow pods ==="
+  kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true
+  echo "=== Kubeflow validating webhooks ==="
+  kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
+  echo "=== Kubeflow Trainer CRD ==="
+  kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
+}
+
+# Show dynamo-system pods, the last 100 operator log lines, and recent events.
+print_dynamo_diagnostics() {
+  echo "=== Dynamo pods ==="
+  kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true
+  echo "=== Dynamo operator logs ==="
+  kubectl_kind -n dynamo-system logs deployment/dynamo-platform-dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
+  echo "=== Recent events (dynamo-system) ==="
+  kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+}
+
+# Show kgateway-system pods plus every GatewayClass and Gateway as YAML.
+print_kgateway_diagnostics() {
+  echo "=== kgateway pods ==="
+  kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true
+  echo "=== GatewayClass status ==="
+  kubectl_kind get gatewayclass -o yaml 2>/dev/null || true
+  echo "=== Gateway status ==="
+  kubectl_kind get gateways -A -o yaml 2>/dev/null || true
+}
+
 case "${mode}" in
   smoke)
+    print_setup_diagnostics
     print_common_gpu_diagnostics
     echo "=== Node status ==="
     kubectl_kind get nodes -o wide 2>/dev/null || true
     ;;
   training)
-    print_workload_inventory cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \
-      nvidia-network-operator kai-scheduler kubeflow
-    print_common_gpu_diagnostics
-    print_grafana_diagnostics
-    print_kai_diagnostics
-    echo "=== Kubeflow Trainer deployment ==="
-    kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
-    echo "=== Kubeflow pods ==="
-    kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true
-    echo "=== Kubeflow validating webhooks ==="
-    kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
-    echo "=== Kubeflow Trainer CRD ==="
-    kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
-    echo "=== Node resources ==="
-    kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true
+    print_h100_common_diagnostics kubeflow
+    print_kubeflow_diagnostics
     ;;
   inference)
-    print_workload_inventory cert-manager gpu-operator monitoring skyhook nvsentinel nvidia-dra-driver \
-      nvidia-network-operator kai-scheduler dynamo-system kgateway-system
-    print_common_gpu_diagnostics
-    echo "=== Dynamo pods ==="
-    kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true
-    echo "=== Dynamo operator logs ==="
-    kubectl_kind -n dynamo-system logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
-    echo "=== Recent events (dynamo-system) ==="
-    kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-    print_kai_diagnostics
-    echo "=== Custom metrics API ==="
-    for metric in gpu_utilization gpu_memory_used gpu_power_usage; do
-      echo "--- ${metric} ---"
-      for ns in gpu-operator dynamo-system; do
-        kubectl_kind get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${ns}/pods/*/${metric}" 2>/dev/null | jq . || true
-      done
-    done
-    print_grafana_diagnostics
-    echo "=== prometheus-adapter pods ==="
-    kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
-    echo "=== kgateway pods ==="
-    kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true
-    echo "=== GatewayClass status ==="
-    kubectl_kind get gatewayclass -o yaml 2>/dev/null || true
-    echo "=== Gateway status ==="
-    kubectl_kind get gateways -A -o yaml 2>/dev/null || true
-    echo "=== DCGM Exporter pods ==="
-    kubectl_kind -n gpu-operator get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
-    echo "=== Monitoring pods ==="
-    kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true
-    echo "=== DRA ResourceSlices ==="
-    kubectl_kind get resourceslices -o wide 2>/dev/null || true
-    echo "=== Node status ==="
-    kubectl_kind get nodes -o wide 2>/dev/null || true
+    print_h100_common_diagnostics dynamo-system kgateway-system
+    print_dynamo_diagnostics
+    print_kgateway_diagnostics
     ;;
   *)
     echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}"
diff --git a/.github/scripts/gpu-runtime-component-health.sh b/.github/scripts/gpu-runtime-component-health.sh
new file mode 100644
index 000000000..93b8efc7b
--- /dev/null
+++ b/.github/scripts/gpu-runtime-component-health.sh
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Gate for the H100 Kind runtime tests: waits for the runtime component
+# Deployments to become Available and checks mode-specific objects.
+#
+# Usage: gpu-runtime-component-health.sh <training|inference>
+#
+# Environment:
+#   KIND_CLUSTER_NAME         Kind cluster name; kubectl uses context kind-<name>.
+#   COMPONENT_HEALTH_TIMEOUT  Timeout for each kubectl wait call (default: 120s).
+
+set -euo pipefail
+
+# Reject a missing or unsupported mode up front, so a bad argument fails
+# immediately rather than after the monitoring and kai-scheduler waits below.
+if [[ $# -ne 1 || ! "${1:-}" =~ ^(training|inference)$ ]]; then
+  echo "::error::Usage: $0 <training|inference>"
+  exit 2
+fi
+
+mode="$1"
+COMPONENT_HEALTH_TIMEOUT="${COMPONENT_HEALTH_TIMEOUT:-120s}"
+# Fail with a clear message instead of a bare "unbound variable" error later.
+: "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+
+# Run kubectl against the Kind cluster with a 10s request timeout, capped at 30s overall.
+kubectl_kind() {
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+# Run kubectl against the Kind cluster without the request timeout or the 30s cap.
+kubectl_kind_wait() {
+  # kubectl wait opens a watch that is already bounded by --timeout. Keep
+  # request-timeout unset here so a slow API server does not cut the watch short.
+  kubectl --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+# Print workloads and the most recent events for namespace $1 (best effort).
+print_namespace_diagnostics() {
+  local ns="$1"
+
+  echo "=== ${ns} workloads ==="
+  kubectl_kind -n "${ns}" get deployments,statefulsets,daemonsets,pods -o wide 2>/dev/null || true
+  echo "=== Recent events (${ns}) ==="
+  kubectl_kind -n "${ns}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -80 || true
+}
+
+# Wait for the given deployments in a namespace to report Available.
+# Arguments: <namespace> <deployment/name>...
+# Returns 1, after printing namespace diagnostics, if the wait fails.
+wait_for_deployments() {
+  local ns="$1"
+  shift
+  local deployments=("$@")
+
+  echo "Waiting up to ${COMPONENT_HEALTH_TIMEOUT} for ${ns}: ${deployments[*]}"
+  if kubectl_kind_wait -n "${ns}" wait \
+    --for=condition=Available \
+    --timeout="${COMPONENT_HEALTH_TIMEOUT}" \
+    "${deployments[@]}"; then
+    return 0
+  fi
+
+  echo "::error::One or more deployments in ${ns} did not become Available within ${COMPONENT_HEALTH_TIMEOUT}: ${deployments[*]}"
+  print_namespace_diagnostics "${ns}"
+  return 1
+}
+
+# Check once that object $1 (type/name) exists; despite the name, this does not poll.
+# Returns 1 if the lookup fails.
+wait_for_required_object() {
+  local resource="$1"
+
+  echo "Verifying ${resource}"
+  if kubectl_kind get "${resource}" >/dev/null; then
+    return 0
+  fi
+
+  echo "::error::Required object is missing: ${resource}"
+  return 1
+}
+
+echo "=== Runtime component health (${mode}) ==="
+
+# Components checked for both modes.
+wait_for_deployments monitoring \
+  deployment/kube-prometheus-operator
+
+wait_for_deployments kai-scheduler \
+  deployment/kai-scheduler-default \
+  deployment/admission \
+  deployment/binder \
+  deployment/kai-operator \
+  deployment/pod-grouper \
+  deployment/podgroup-controller \
+  deployment/queue-controller
+
+# Mode-specific components.
+case "${mode}" in
+  training)
+    wait_for_deployments kubeflow \
+      deployment/kubeflow-trainer-controller-manager
+    wait_for_required_object validatingwebhookconfiguration/validator.trainer.kubeflow.org
+    wait_for_required_object customresourcedefinition/trainjobs.trainer.kubeflow.org
+    ;;
+  inference)
+    wait_for_deployments dynamo-system \
+      deployment/dynamo-platform-dynamo-operator-controller-manager \
+      deployment/grove-operator
+    wait_for_deployments kgateway-system \
+      deployment/kgateway \
+      deployment/inference-gateway
+    ;;
+  *)
+    # Not reachable after the up-front validation; kept as a guard.
+    echo "::error::unknown runtime component health mode: ${mode}"
+    exit 2
+    ;;
+esac
+
+echo "Runtime component health check passed."
diff --git a/.github/scripts/gpu-smoke-run-nvidia-smi.sh b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
index 1a2151253..0d4ea31d7 100644
--- a/.github/scripts/gpu-smoke-run-nvidia-smi.sh
+++ b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
@@ -15,7 +15,19 @@
 
 set -euo pipefail
 
-pod_name=$(cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" create -f - -o jsonpath='{.metadata.name}'
+KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}"
+KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-130s}"
+POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}"
+
+kubectl_kind() {
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+kubectl_kind_wait() {
+  timeout 150s kubectl --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+pod_name=$(cat <<'EOF' | kubectl_kind create -f - -o jsonpath='{.metadata.name}'
 apiVersion: v1
 kind: Pod
 metadata:
@@ -34,10 +46,10 @@ spec:
 EOF
 )
 
-echo "${pod_name}" > /tmp/aicr-gpu-smoke-pod-name
+echo "${pod_name}" > "${POD_NAME_FILE}"
 
 echo "Waiting for ${pod_name} pod to complete..."
-kubectl --context="kind-${KIND_CLUSTER_NAME}" wait "pod/${pod_name}" \
+kubectl_kind_wait wait "pod/${pod_name}" \
   --for=condition=Ready --timeout=120s || true
-kubectl --context="kind-${KIND_CLUSTER_NAME}" wait "pod/${pod_name}" \
+kubectl_kind_wait wait "pod/${pod_name}" \
   --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s
diff --git a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
index 982648460..25b33c862 100644
--- a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
+++ b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
@@ -15,13 +15,20 @@
 
 set -euo pipefail
 
+KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}"
+POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}"
+
+kubectl_kind() {
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
 pod_name=""
-if [[ -f /tmp/aicr-gpu-smoke-pod-name ]]; then
-  pod_name="$(cat /tmp/aicr-gpu-smoke-pod-name)"
+if [[ -f "${POD_NAME_FILE}" ]]; then
+  pod_name="$(cat "${POD_NAME_FILE}")"
 fi
 
 if [[ -z "${pod_name}" ]]; then
-  pod_name=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods \
+  pod_name=$(kubectl_kind get pods \
     -l app=gpu-smoke-test \
     --sort-by=.metadata.creationTimestamp \
     -o jsonpath='{.items[-1:].metadata.name}')
@@ -32,4 +39,5 @@ if [[ -z "${pod_name}" ]]; then
   exit 1
 fi
 
-kubectl --context="kind-${KIND_CLUSTER_NAME}" logs "${pod_name}"
+trap 'rm -f "${POD_NAME_FILE}"' EXIT  # also removes the pod-name file when kubectl logs fails
+kubectl_kind logs "${pod_name}"
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index 08152c25e..4f06bb396 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -16,7 +16,7 @@ name: GPU Inference Test (nvkind + H100 x2)
 
 on:
   schedule:
-    - cron: '15 6,18 * * *'  # Every 12 hours (2x daily), offset from T4 smoke test
+    - cron: '15 6,18 * * *'  # Every 12 hours (2x daily), offset from training test
   push:
     branches:
       - "pull-request/[0-9]+"
@@ -56,8 +56,14 @@ jobs:
               - '.github/actions/install-karpenter-kwok/**'
               - 'validators/*/Dockerfile'
               - 'pkg/evidence/**'
+              - '.github/workflows/gpu-h100-kind-runtime-test.yaml'
+              - '.github/actions/gpu-debug-diagnostics/**'
               - '.github/actions/gpu-test-cleanup/**'
               - '.github/actions/load-versions/**'
+              - '.github/scripts/gpu-chainsaw-health.sh'
+              - '.github/scripts/gpu-debug-diagnostics.sh'
+              - '.github/scripts/gpu-runtime-component-health.sh'
+              - '.github/scripts/gpu-validate-conformance.sh'
               - 'pkg/bundler/deployer/helm/**'
               - 'tests/chainsaw/chainsaw-config.yaml'
               - 'tests/chainsaw/ai-conformance/main.go'
@@ -84,11 +90,11 @@ jobs:
               - 'pkg/defaults/timeouts.go'
               - 'validators/conformance/**'
 
+  # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+  # checkout. PR GPU coverage runs through the pull-request/<number> push
+  # mirror after ok-to-test approval.
   gpu-inference-test:
     needs: [check-paths]
-    # NVIDIA self-hosted GPU runners reject pull_request event jobs before
-    # checkout. PR GPU coverage runs through the pull-request/<number> push
-    # mirror after ok-to-test approval.
     if: >
       always() && (
         github.event_name == 'schedule' ||
@@ -96,184 +102,11 @@ jobs:
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
     name: GPU Inference Test (nvkind + H100 x2)
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: linux-amd64-gpu-h100-latest-2
-    # Cold self-hosted H100 runners can spend most of the old budget pulling
-    # images and loading Kind nodes before validation starts.
-    timeout-minutes: 180
-
-    env:
-      KIND_CLUSTER_NAME: gpu-inference-test
-
-    steps:
-
-      - name: Checkout Code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Load GPU test versions
-        id: versions
-        uses: ./.github/actions/load-versions
-
-      - name: Set up GPU cluster
-        uses: ./.github/actions/gpu-cluster-setup
-        with:
-          kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}
-          min_gpu_count: '2'
-          gpu_model_pattern: H100
-          min_free_disk_gb: '50'
-          min_available_memory_gb: '16'
-          cluster_create_timeout: 900s
-          control_plane_resource_patches: 'true'
-          control_plane_leader_election_tuning: 'true'
-
-      - name: Build aicr and snapshot agent image
-        uses: ./.github/actions/aicr-build
-        with:
-          build_snapshot_agent: 'true'
-          validator_phases: 'none'
-
-      # Fast readiness gate after cluster setup. Stability windows start after
-      # runtime install, where component rollouts can stress the control plane.
-      - name: Check control plane health
-        uses: ./.github/actions/check-control-plane-health
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          wait_timeout: 120s
-          stability_window: 0s
-          recover_unhealthy: 'true'
-
-      - name: Install runtime bundle
-        id: bundle-install
-        uses: ./.github/actions/gpu-operator-install
-        with:
-          method: bundle
-          accelerator: h100
-          platform: dynamo
-          wait: 'true'
-          best_effort: 'false'
-
-      # Runtime install creates many CRDs, webhooks, and controllers. Keep a
-      # stability window here to catch KCM/scheduler restarts before snapshot.
-      - name: Check control plane health
-        uses: ./.github/actions/check-control-plane-health
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          wait_timeout: 120s
-          stability_window: 60s
-          recover_unhealthy: 'true'
-
-      # --- Snapshot and GPU validation ---
-
-      - name: Snapshot and validate GPU
-        uses: ./.github/actions/gpu-snapshot-validate
-        with:
-          gpu_model: H100
-          min_gpu_count: '2'
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          snapshot_timeout: 10m
-
-      # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
-
-      # Snapshot deploys a GPU Job and exercises cluster discovery; verify the
-      # control plane stayed stable before adding Karpenter/KWOK.
-      - name: Check control plane health
-        uses: ./.github/actions/check-control-plane-health
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          wait_timeout: 120s
-          stability_window: 60s
-          recover_unhealthy: 'true'
-
-      - name: Install Karpenter + KWOK
-        uses: ./.github/actions/install-karpenter-kwok
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          kwok_helm_timeout: 600s
-          ko_build_timeout: '1200'
-          karpenter_helm_timeout: 600s
-
-      # --- Health checks ---
-
-      - name: Install chainsaw
-        uses: ./.github/actions/setup-build-tools
-        with:
-          install_chainsaw: 'true'
-          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
-
-      # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above
-      # only installs a runner-side binary.
-      - name: Check control plane health
-        uses: ./.github/actions/check-control-plane-health
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          wait_timeout: 120s
-          stability_window: 60s
-          recover_unhealthy: 'true'
-
-      - name: Run chainsaw health checks
-        # The H100 stack can make namespace cleanup API calls slow under load.
-        # Keep cleanup enabled, but allow more than the default 30s deadline.
-        run: bash .github/scripts/gpu-chainsaw-health.sh tests/chainsaw/ai-conformance/kind-inference-dynamo
-
-      # --- CNCF AI Conformance validation ---
-      # Runs after the stack health checks so gateway and metrics validators
-      # see a settled inference stack.
-
-      - name: Build conformance validator image
-        uses: ./.github/actions/aicr-build
-        with:
-          build_cli: 'false'
-          build_snapshot_agent: 'false'
-          validator_phases: 'conformance'
-
-      # Validator image build/load can contend with Docker and kind containerd;
-      # verify the control plane before the final conformance workload.
-      - name: Check control plane health
-        uses: ./.github/actions/check-control-plane-health
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          wait_timeout: 120s
-          stability_window: 60s
-          recover_unhealthy: 'true'
-
-      - name: Validate CNCF AI Conformance
-        id: validate-conformance
-        run: bash .github/scripts/gpu-validate-conformance.sh
-
-      # Dynamo smoke is intentionally disabled for now. The vLLM runtime image
-      # adds significant latency and flakiness in Kind CI, and training has no
-      # matching smoke path yet. Reintroduce it later alongside a symmetric
-      # training smoke test if needed.
-      # --- Validation artifacts ---
-
-      - name: Upload validation artifacts
-        if: always()
-        timeout-minutes: 5
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: conformance-evidence
-          path: |
-            conformance-evidence/
-            validation-result.yaml
-          if-no-files-found: warn
-
-      # --- Debug diagnostics (before cleanup so resources still exist) ---
-
-      - name: Debug diagnostics
-        if: failure()
-        timeout-minutes: 5
-        shell: bash
-        env:
-          GPU_TEST_DIAGNOSTIC_MODE: inference
-        run: bash .github/scripts/gpu-debug-diagnostics.sh
-
-      - name: GPU Test Cleanup
-        if: always()
-        uses: ./.github/actions/gpu-test-cleanup
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          artifact_name_prefix: gpu-inference-test-debug
+    uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml
+    with:
+      job_name: GPU Inference Test (nvkind + H100 x2)
+      cluster_name: gpu-inference-test
+      intent: inference
+      platform: dynamo
+      chainsaw_path: tests/chainsaw/ai-conformance/kind-inference-dynamo
+      artifact_name_prefix: gpu-inference-test-debug
diff --git a/.github/workflows/gpu-h100-kind-runtime-test.yaml b/.github/workflows/gpu-h100-kind-runtime-test.yaml
new file mode 100644
index 000000000..6d0f8757b
--- /dev/null
+++ b/.github/workflows/gpu-h100-kind-runtime-test.yaml
@@ -0,0 +1,228 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: GPU H100 Kind Runtime Test
+
+on:
+  workflow_call:
+    inputs:
+      job_name:
+        description: 'Display name for the H100 runtime job'
+        required: true
+        type: string
+      cluster_name:
+        description: 'Kind cluster name'
+        required: true
+        type: string
+      intent:
+        description: 'Runtime intent passed to the bundle installer'
+        required: true
+        type: string
+      platform:
+        description: 'Runtime platform passed to the bundle installer'
+        required: true
+        type: string
+      chainsaw_path:
+        description: 'Chainsaw health-check directory'
+        required: true
+        type: string
+      artifact_name_prefix:
+        description: 'Prefix for uploaded debug artifacts'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  gpu-h100-kind-runtime-test:
+    name: ${{ inputs.job_name }}
+    runs-on: linux-amd64-gpu-h100-latest-2
+    # Cold self-hosted H100 runners can spend most of this budget pulling
+    # images and loading Kind nodes before validation starts.
+    timeout-minutes: 180
+    concurrency:
+      group: gpu-h100-${{ inputs.cluster_name }}-${{ github.event_name }}-${{ github.ref }}
+      cancel-in-progress: true
+
+    env:
+      KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+
+      - name: Load GPU test versions
+        id: versions
+        uses: ./.github/actions/load-versions
+
+      - name: Set up GPU cluster
+        uses: ./.github/actions/gpu-cluster-setup
+        with:
+          kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}
+          min_gpu_count: '2'
+          gpu_model_pattern: H100
+          min_free_disk_gb: '50'
+          min_available_memory_gb: '16'
+          cluster_create_timeout: 900s
+          control_plane_resource_patches: 'true'
+          control_plane_leader_election_tuning: 'true'
+
+      - name: Build aicr and snapshot agent image
+        uses: ./.github/actions/aicr-build
+        with:
+          build_snapshot_agent: 'true'
+          validator_phases: 'none'
+
+      # Fast readiness gate after cluster setup. Stability windows start after
+      # runtime install, where component rollouts can stress the control plane.
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+          stability_window: 0s
+          recover_unhealthy: 'true'
+
+      - name: Install runtime bundle
+        id: bundle-install
+        uses: ./.github/actions/gpu-operator-install
+        with:
+          method: bundle
+          accelerator: h100
+          intent: ${{ inputs.intent }}
+          platform: ${{ inputs.platform }}
+          wait: 'true'
+          best_effort: 'false'
+
+      # Pass workflow inputs through env instead of interpolating them into the
+      # script text, so input values are never parsed as shell code.
+      - name: Check runtime component health
+        env:
+          WORKFLOW_INPUT_INTENT: ${{ inputs.intent }}
+        run: bash .github/scripts/gpu-runtime-component-health.sh "${WORKFLOW_INPUT_INTENT}"
+
+      # Runtime install creates many CRDs, webhooks, and controllers. Keep a
+      # stability window here to catch KCM/scheduler restarts before snapshot.
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
+
+      - name: Snapshot and validate GPU
+        uses: ./.github/actions/gpu-snapshot-validate
+        with:
+          gpu_model: H100
+          min_gpu_count: '2'
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          snapshot_timeout: 10m
+
+      # Snapshot deploys a GPU Job and exercises cluster discovery; verify the
+      # control plane stayed stable before adding Karpenter/KWOK.
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
+
+      - name: Install Karpenter + KWOK
+        uses: ./.github/actions/install-karpenter-kwok
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          kwok_helm_timeout: 600s
+          ko_build_timeout: '1200'
+          karpenter_helm_timeout: 600s
+
+      - name: Install chainsaw
+        uses: ./.github/actions/setup-build-tools
+        with:
+          install_chainsaw: 'true'
+          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
+
+      # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above
+      # only installs a runner-side binary.
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
+
+      # The input reaches the script as an env value, not as interpolated shell text.
+      - name: Run chainsaw health checks
+        env:
+          WORKFLOW_INPUT_CHAINSAW_PATH: ${{ inputs.chainsaw_path }}
+        run: bash .github/scripts/gpu-chainsaw-health.sh "${WORKFLOW_INPUT_CHAINSAW_PATH}"
+
+      - name: Build conformance validator image
+        uses: ./.github/actions/aicr-build
+        with:
+          build_cli: 'false'
+          build_snapshot_agent: 'false'
+          validator_phases: 'conformance'
+
+      # Validator image build/load can contend with Docker and kind containerd;
+      # verify the control plane before the final conformance workload.
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
+
+      - name: Validate CNCF AI Conformance
+        id: validate-conformance
+        run: bash .github/scripts/gpu-validate-conformance.sh
+
+      - name: Upload validation artifacts
+        if: always()
+        timeout-minutes: 5
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
+        with:
+          name: conformance-evidence
+          path: |
+            conformance-evidence/
+            validation-result.yaml
+          if-no-files-found: warn
+
+      - name: Debug diagnostics
+        if: failure()
+        timeout-minutes: 5
+        uses: ./.github/actions/gpu-debug-diagnostics
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          mode: ${{ inputs.intent }}
+
+      - name: Mark debug artifact collection
+        id: gpu-debug-artifacts
+        if: failure() || cancelled()
+        shell: bash
+        run: echo "collect=true" >> "${GITHUB_OUTPUT}"
+
+      - name: GPU Test Cleanup
+        if: always()
+        uses: ./.github/actions/gpu-test-cleanup
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          artifact_name_prefix: ${{ inputs.artifact_name_prefix }}
+          collect_artifacts: ${{ steps.gpu-debug-artifacts.outputs.collect == 'true' }}
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index f5ecbaf7f..51fbed8ba 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -56,8 +56,14 @@ jobs:
               - '.github/actions/install-karpenter-kwok/**'
               - 'validators/*/Dockerfile'
               - 'pkg/evidence/**'
+              - '.github/workflows/gpu-h100-kind-runtime-test.yaml'
+              - '.github/actions/gpu-debug-diagnostics/**'
               - '.github/actions/gpu-test-cleanup/**'
               - '.github/actions/load-versions/**'
+              - '.github/scripts/gpu-chainsaw-health.sh'
+              - '.github/scripts/gpu-debug-diagnostics.sh'
+              - '.github/scripts/gpu-runtime-component-health.sh'
+              - '.github/scripts/gpu-validate-conformance.sh'
               - 'pkg/bundler/deployer/helm/**'
               - 'tests/chainsaw/chainsaw-config.yaml'
               - 'tests/chainsaw/ai-conformance/main.go'
@@ -80,11 +86,11 @@ jobs:
               - 'pkg/defaults/timeouts.go'
               - 'validators/conformance/**'
 
+  # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+  # checkout. PR GPU coverage runs through the pull-request/<number> push
+  # mirror after ok-to-test approval.
   gpu-training-test:
     needs: [check-paths]
-    # NVIDIA self-hosted GPU runners reject pull_request event jobs before
-    # checkout. PR GPU coverage runs through the pull-request/<number> push
-    # mirror after ok-to-test approval.
     if: >
       always() && (
         github.event_name == 'schedule' ||
@@ -92,180 +98,11 @@ jobs:
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
     name: GPU Training Test (nvkind + H100 x2)
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: linux-amd64-gpu-h100-latest-2
-    # Cold self-hosted H100 runners can spend most of the old budget pulling
-    # images and loading Kind nodes before validation starts.
-    timeout-minutes: 180
-
-    env:
-      KIND_CLUSTER_NAME: gpu-training-test
-
-    steps:
-
-      - name: Checkout Code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Load GPU test versions
-        id: versions
-        uses: ./.github/actions/load-versions
-
-      - name: Set up GPU cluster
-        uses: ./.github/actions/gpu-cluster-setup
-        with:
-          kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}
-          min_gpu_count: '2'
-          gpu_model_pattern: H100
-          min_free_disk_gb: '50'
-          min_available_memory_gb: '16'
-          cluster_create_timeout: 900s
-          control_plane_resource_patches: 'true'
-          control_plane_leader_election_tuning: 'true'
-
-      - name: Build aicr and snapshot agent image
-        uses: ./.github/actions/aicr-build
-        with:
-          build_snapshot_agent: 'true'
-          validator_phases: 'none'
-
-      # Fast readiness gate after cluster setup. Stability windows start after
-      # runtime install, where component rollouts can stress the control plane.
-      - name: Check control plane health
-        uses: ./.github/actions/check-control-plane-health
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          wait_timeout: 120s
-          stability_window: 0s
-          recover_unhealthy: 'true'
-
-      - name: Install runtime bundle
-        id: bundle-install
-        uses: ./.github/actions/gpu-operator-install
-        with:
-          method: bundle
-          accelerator: h100
-          intent: training
-          platform: kubeflow
-          wait: 'true'
-          best_effort: 'false'
-
-      # Runtime install creates many CRDs, webhooks, and controllers. Keep a
-      # stability window here to catch KCM/scheduler restarts before snapshot.
-      - name: Check control plane health
-        id: post_runtime_control_plane_health
-        uses: ./.github/actions/check-control-plane-health
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          wait_timeout: 120s
-          stability_window: 60s
-          recover_unhealthy: 'true'
-
-      - name: Snapshot and validate GPU
-        uses: ./.github/actions/gpu-snapshot-validate
-        with:
-          gpu_model: H100
-          min_gpu_count: '2'
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          snapshot_timeout: 10m
-
-      # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
-
-      # Snapshot deploys a GPU Job and exercises cluster discovery; verify the
-      # control plane stayed stable before adding Karpenter/KWOK.
-      - name: Check control plane health
-        uses: ./.github/actions/check-control-plane-health
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          wait_timeout: 120s
-          stability_window: 60s
-          recover_unhealthy: 'true'
-
-      - name: Install Karpenter + KWOK
-        uses: ./.github/actions/install-karpenter-kwok
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          kwok_helm_timeout: 600s
-          ko_build_timeout: '1200'
-          karpenter_helm_timeout: 600s
-
-      # --- Health checks ---
-
-      - name: Install chainsaw
-        uses: ./.github/actions/setup-build-tools
-        with:
-          install_chainsaw: 'true'
-          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
-
-      # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above
-      # only installs a runner-side binary.
-      - name: Check control plane health
-        uses: ./.github/actions/check-control-plane-health
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          wait_timeout: 120s
-          stability_window: 60s
-          recover_unhealthy: 'true'
-
-      - name: Run chainsaw health checks
-        # The H100 stack can make namespace cleanup API calls slow under load.
-        # Keep cleanup enabled, but allow more than the default 30s deadline.
-        run: bash .github/scripts/gpu-chainsaw-health.sh tests/chainsaw/ai-conformance/kind-training-kubeflow
-
-      # --- CNCF AI Conformance validation ---
-      # Runs last to ensure the DCGM → Prometheus → adapter pipeline
-      # has had time to bootstrap (pod-autoscaling check needs live metric data).
-
-      - name: Build conformance validator image
-        uses: ./.github/actions/aicr-build
-        with:
-          build_cli: 'false'
-          build_snapshot_agent: 'false'
-          validator_phases: 'conformance'
-
-      # Validator image build/load can contend with Docker and kind containerd;
-      # verify the control plane before the final conformance workload.
-      - name: Check control plane health
-        uses: ./.github/actions/check-control-plane-health
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          wait_timeout: 120s
-          stability_window: 60s
-          recover_unhealthy: 'true'
-
-      - name: Validate CNCF AI Conformance
-        id: validate-conformance
-        run: bash .github/scripts/gpu-validate-conformance.sh
-
-      # --- Validation artifacts ---
-
-      - name: Upload validation artifacts
-        if: always()
-        timeout-minutes: 5
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: conformance-evidence
-          path: |
-            conformance-evidence/
-            validation-result.yaml
-          if-no-files-found: warn
-
-      # --- Debug diagnostics (before cleanup so resources still exist) ---
-
-      - name: Debug diagnostics
-        if: failure()
-        timeout-minutes: 5
-        shell: bash
-        env:
-          GPU_TEST_DIAGNOSTIC_MODE: training
-        run: bash .github/scripts/gpu-debug-diagnostics.sh
-
-      - name: GPU Test Cleanup
-        if: always()
-        uses: ./.github/actions/gpu-test-cleanup
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          artifact_name_prefix: gpu-training-test-debug
+    uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml
+    with:
+      job_name: GPU Training Test (nvkind + H100 x2)
+      cluster_name: gpu-training-test
+      intent: training
+      platform: kubeflow
+      chainsaw_path: tests/chainsaw/ai-conformance/kind-training-kubeflow
+      artifact_name_prefix: gpu-training-test-debug
diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml
index bdf607e07..af8d3860c 100644
--- a/.github/workflows/gpu-smoke-test.yaml
+++ b/.github/workflows/gpu-smoke-test.yaml
@@ -50,8 +50,13 @@ jobs:
               - '.github/actions/gpu-cluster-setup/**'
               - '.github/actions/gpu-operator-install/**'
               - '.github/actions/aicr-build/**'
+              - '.github/actions/gpu-debug-diagnostics/**'
               - '.github/actions/gpu-test-cleanup/**'
+              - '.github/actions/gpu-smoke-nvidia-smi/**'
               - '.github/actions/load-versions/**'
+              - '.github/scripts/gpu-debug-diagnostics.sh'
+              - '.github/scripts/gpu-smoke-run-nvidia-smi.sh'
+              - '.github/scripts/gpu-smoke-show-nvidia-smi-log.sh'
               - 'pkg/collector/**'
               - 'pkg/snapshotter/**'
               - '.github/actions/gpu-snapshot-validate/**'
@@ -108,10 +113,9 @@ jobs:
           method: helm
 
       - name: Run nvidia-smi in a pod
-        run: bash .github/scripts/gpu-smoke-run-nvidia-smi.sh
-
-      - name: Show nvidia-smi output
-        run: bash .github/scripts/gpu-smoke-show-nvidia-smi-log.sh
+        uses: ./.github/actions/gpu-smoke-nvidia-smi
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
 
       # --- Snapshot and validation ---
 
@@ -124,12 +128,20 @@ jobs:
 
       - name: Debug diagnostics
         if: failure()
-        env:
-          GPU_TEST_DIAGNOSTIC_MODE: smoke
-        run: bash .github/scripts/gpu-debug-diagnostics.sh
+        uses: ./.github/actions/gpu-debug-diagnostics
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          mode: smoke
+
+      - name: Mark debug artifact collection
+        id: gpu-debug-artifacts
+        if: failure() || cancelled()
+        shell: bash
+        run: echo "collect=true" >> "${GITHUB_OUTPUT}"
 
       - name: GPU Test Cleanup
         if: always()
         uses: ./.github/actions/gpu-test-cleanup
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          collect_artifacts: ${{ steps.gpu-debug-artifacts.outputs.collect == 'true' }}
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index da23545a0..d266afb96 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -588,6 +588,10 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 				`requires Helm v4.0.5 or later`,
 				`--wait --timeout 20m`,
 			},
+			rejectSnippets: []string{
+				`local prerelease`,
+				`if [[ -n "${prerelease}" ]]`,
+			},
 		},
 		{
 			name: "kube-prometheus-stack",
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index 4c1e26f28..11ffb8475 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -72,8 +72,7 @@ function helm_supports_server_side_false_install() {
   version="$(helm version --short 2>/dev/null | head -n 1 || true)"
   version="${version#v}"
   version="${version%%+*}"
-  version="${version%%-*}"
-  if ! [[ "${version}" =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then
+  if ! [[ "${version}" =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)(-[0-9A-Za-z.-]+)?$ ]]; then
     return 1
   fi
   major="${BASH_REMATCH[1]}"
diff --git a/recipes/overlays/kind.yaml b/recipes/overlays/kind.yaml
index b0d8dbd76..791016135 100644
--- a/recipes/overlays/kind.yaml
+++ b/recipes/overlays/kind.yaml
@@ -115,6 +115,11 @@ spec:
     - name: kube-prometheus-stack
       type: Helm
       overrides:
+        # CI only needs component health, not the full upstream alerting rule
+        # set. Skipping default rules reduces PrometheusRule churn during
+        # install on small kind control planes.
+        defaultRules:
+          create: false
         prometheus:
           prometheusSpec:
             # Smaller storage for local testing
@@ -132,7 +137,35 @@ spec:
                 memory: 1Gi
             # Shorter retention for local testing
             retention: 7d
+        prometheusOperator:
+          # Keep operator-owned monitoring custom resources in the monitoring
+          # namespace for kind. Do not scope ServiceMonitor discovery here;
+          # GPU, Kubeflow, and Dynamo monitors may live in their own namespaces.
+          alertmanagerInstanceNamespaces:
+            - monitoring
+          alertmanagerConfigNamespaces:
+            - monitoring
+          prometheusInstanceNamespaces:
+            - monitoring
+          thanosRulerInstanceNamespaces:
+            - monitoring
+          # CI kind control planes can be slow under image pulls and controller
+          # startup. Avoid restarting the operator on short health probe stalls.
+          livenessProbe:
+            timeoutSeconds: 10
+            failureThreshold: 10
+          readinessProbe:
+            timeoutSeconds: 10
+            failureThreshold: 6
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
         grafana:
+          enabled: false
           resources:
             requests:
               cpu: 100m

From 727f1b0cfb515e58bf3820587bfc803760bb4047 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Sun, 26 Apr 2026 19:59:25 -0700
Subject: [PATCH 21/21] ci: address follow-up GPU review feedback

---
 .../configure-nvidia-container-toolkit.sh     |  2 +-
 .../collect-debug-artifacts.sh                |  7 ++++
 .../scripts/gpu-runtime-component-health.sh   | 34 ++++++++++++++++---
 .../scripts/gpu-smoke-show-nvidia-smi-log.sh  |  5 ++-
 pkg/bundler/deployer/helm/helm_test.go        | 10 ++++--
 .../deployer/helm/templates/deploy.sh.tmpl    | 28 ++++++++-------
 6 files changed, 66 insertions(+), 20 deletions(-)

diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
index 16352077c..84635a988 100644
--- a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
+++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
@@ -25,7 +25,7 @@ set -e
 if (( restart_status != 0 )); then
   echo "::error::Docker restart failed after NVIDIA runtime configuration"
   sudo systemctl status docker --no-pager || true
-  journalctl -u docker --since "10 minutes ago" --no-pager || true
+  sudo journalctl -u docker --since "10 minutes ago" --no-pager || true
   exit "${restart_status}"
 fi
 
diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
index cb33e770a..7c780e3f7 100644
--- a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
+++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
@@ -19,6 +19,7 @@ set -uo pipefail
 rm -rf /tmp/debug-artifacts
 mkdir -p /tmp/debug-artifacts
 CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd"
+MAX_KIND_NODE_ARTIFACT_SECONDS="${MAX_KIND_NODE_ARTIFACT_SECONDS:-600}"
 kubectl_kind() {
   timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
 }
@@ -95,9 +96,15 @@ else
   echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive"
 fi
 
+artifact_loop_start="$(date +%s)"
 docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
   --format '{{.Names}}' | sort | while read -r node_container; do
     [[ -z "${node_container}" ]] && continue
+    artifact_loop_elapsed=$(($(date +%s) - artifact_loop_start))
+    if (( artifact_loop_elapsed > MAX_KIND_NODE_ARTIFACT_SECONDS )); then
+      echo "Kind node artifact collection exceeded ${MAX_KIND_NODE_ARTIFACT_SECONDS}s; stopping after partial collection."
+      break
+    fi
     node_file="${node_container//[^A-Za-z0-9_.-]/_}"
     docker_timeout 30s inspect "${node_container}" \
       > "/tmp/debug-artifacts/${node_file}-docker-inspect.json" 2>&1 || true
diff --git a/.github/scripts/gpu-runtime-component-health.sh b/.github/scripts/gpu-runtime-component-health.sh
index 93b8efc7b..3d668d37b 100644
--- a/.github/scripts/gpu-runtime-component-health.sh
+++ b/.github/scripts/gpu-runtime-component-health.sh
@@ -23,6 +23,22 @@ fi
 mode="$1"
 COMPONENT_HEALTH_TIMEOUT="${COMPONENT_HEALTH_TIMEOUT:-120s}"
 
+# Print "<N>s|m|h" as whole seconds; reject any other form with a clear error.
+duration_seconds() {
+  local input_value="$1"
+  if ! [[ "${input_value}" =~ ^([0-9]+)([smh])$ ]]; then
+    echo "::error::unsupported duration '${input_value}'" >&2
+    exit 1
+  fi
+  # 10# forces base 10 so zero-padded input (e.g. 010s) is not read as octal.
+  local number="$((10#${BASH_REMATCH[1]}))"
+  case "${BASH_REMATCH[2]}" in
+    s) echo "${number}" ;;
+    m) echo "$((number * 60))" ;;
+    h) echo "$((number * 3600))" ;;
+  esac
+}
+
 kubectl_kind() {
   timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
 }
@@ -62,13 +78,23 @@ wait_for_deployments() {
 
 wait_for_required_object() {
   local resource="$1"
+  # Poll for the object until COMPONENT_HEALTH_TIMEOUT; bail out on a bad timeout.
+  local timeout_seconds deadline
 
-  echo "Verifying ${resource}"
-  if kubectl_kind get "${resource}" >/dev/null; then
-    return 0
-  fi
+  timeout_seconds="$(duration_seconds "${COMPONENT_HEALTH_TIMEOUT}")" || return 1
+  deadline=$((SECONDS + timeout_seconds))
+
+  echo "Waiting up to ${COMPONENT_HEALTH_TIMEOUT} for ${resource}"
+  while (( SECONDS <= deadline )); do
+    if kubectl_kind get "${resource}" >/dev/null; then
+      return 0
+    fi
+    sleep 2
+  done
 
   echo "::error::Required object is missing: ${resource}"
+  kubectl_kind get "${resource}" -o yaml 2>/dev/null || true
+  kubectl_kind describe "${resource}" 2>/dev/null || true
   return 1
 }
 
diff --git a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
index 25b33c862..05bc09523 100644
--- a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
+++ b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
@@ -17,6 +17,7 @@ set -euo pipefail
 
 KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}"
 POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}"
+trap 'rm -f "${POD_NAME_FILE}"' EXIT
 
 kubectl_kind() {
   kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
@@ -25,6 +26,9 @@ kubectl_kind() {
 pod_name=""
 if [[ -f "${POD_NAME_FILE}" ]]; then
   pod_name="$(cat "${POD_NAME_FILE}")"
+  if [[ -n "${pod_name}" ]] && ! kubectl_kind get pod "${pod_name}" >/dev/null 2>&1; then
+    pod_name=""
+  fi
 fi
 
 if [[ -z "${pod_name}" ]]; then
@@ -40,4 +44,3 @@ if [[ -z "${pod_name}" ]]; then
 fi
 
 kubectl_kind logs "${pod_name}"
-rm -f "${POD_NAME_FILE}"
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index d266afb96..33dc2c6e9 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -557,6 +557,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 		wantSnippets         []string
 		wantReadmeSnippets   []string
 		rejectSnippets       []string
+		rejectScriptSnippets []string
 		rejectReadmeSnippets []string
 		rejectRetryCap       bool
 	}{
@@ -579,7 +580,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 				`Require v4.0.5+ before relying on`,
 				`--request-timeout="${KUBECTL_REQUEST_TIMEOUT}"`,
 				`dynamo-platform conflict mitigation requires Helm v4.0.5+`,
-				`dump_dynamo_platform_helm_diagnostics "${namespace}"`,
+				`dump_dynamo_platform_helm_diagnostics "${component}" "${namespace}"`,
 				`deployment/dynamo-platform-dynamo-operator-controller-manager`,
 				`--previous --tail=200`,
 			},
@@ -588,7 +589,7 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 				`requires Helm v4.0.5 or later`,
 				`--wait --timeout 20m`,
 			},
-			rejectSnippets: []string{
+			rejectScriptSnippets: []string{
 				`local prerelease`,
 				`if [[ -n "${prerelease}" ]]`,
 			},
@@ -703,6 +704,11 @@ func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
 					t.Errorf("deploy.sh should not include %s snippet %q", tt.component.Name, snippet)
 				}
 			}
+			for _, snippet := range tt.rejectScriptSnippets {
+				if strings.Contains(script, snippet) {
+					t.Errorf("deploy.sh should not include %s script snippet %q", tt.component.Name, snippet)
+				}
+			}
 			if tt.rejectRetryCap && retryCapPattern.MatchString(componentBlock) {
 				t.Errorf("deploy.sh should not cap %s retries in its component block", tt.component.Name)
 			}
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index 11ffb8475..6b946a969 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -174,8 +174,9 @@ function dump_kai_scheduler_helm_diagnostics() {
 }
 
 function dump_dynamo_platform_helm_diagnostics() {
-  local namespace="$1"
-  if [[ "${namespace}" != "dynamo-system" ]]; then
+  local component="$1"
+  local namespace="$2"
+  if [[ "${component}" != "dynamo-platform" ]]; then
     return
   fi
 
@@ -202,15 +203,16 @@ function dump_dynamo_platform_helm_diagnostics() {
 }
 
 # helm_retry contract:
-#   helm_retry "<description>" "<namespace>" "<max_retries>" <command> [args...]
-# Callers must pass the retry budget as the third positional argument before the
-# command to execute. This keeps per-component retry tuning explicit at the
-# callsite instead of relying on the global MAX_RETRIES fallback.
+#   helm_retry "<description>" "<component>" "<namespace>" "<max_retries>" <command> [args...]
+# Callers must pass the component name and retry budget before the command to
+# execute. This keeps per-component retry tuning and diagnostics explicit at the
+# callsite instead of relying on global fallbacks.
 function helm_retry() {
   local desc="$1"
-  local namespace="$2"
-  local max_retries="$3"
-  shift 3
+  local component="$2"
+  local namespace="$3"
+  local max_retries="$4"
+  shift 4
   local attempt=0
   while true; do
     if "$@"; then
@@ -218,7 +220,7 @@ function helm_retry() {
     fi
     attempt=$((attempt + 1))
     dump_kai_scheduler_helm_diagnostics "${namespace}"
-    dump_dynamo_platform_helm_diagnostics "${namespace}"
+    dump_dynamo_platform_helm_diagnostics "${component}" "${namespace}"
     if [[ ${attempt} -gt ${max_retries} ]]; then
       echo "ERROR: ${desc} failed after ${attempt} attempts"
       return 1
@@ -471,7 +473,8 @@ if echo "${ASYNC_COMPONENTS}" | grep -qw "{{ .Name }}"; then
   echo "  (async component — skipping --wait, keeping --timeout for hooks)"
 fi
 {{ if .IsOCI -}}
-helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
+helm_retry "{{ .Name }} helm install" "{{ .Name }}" \
+  "{{ .Namespace }}" \
   "${COMPONENT_MAX_RETRIES}" \
   helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
   ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \
@@ -483,7 +486,8 @@ helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
   ${COMPONENT_WAIT_ARGS} \
   || helm_failed "{{ .Name }}"
 {{ else -}}
-helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
+helm_retry "{{ .Name }} helm install" "{{ .Name }}" \
+  "{{ .Namespace }}" \
   "${COMPONENT_MAX_RETRIES}" \
   helm upgrade --install {{ .Name }} {{ .ChartName }} \
   ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \