diff --git a/.github/actions/README.md b/.github/actions/README.md
index cef2fd6ca..15710df7d 100644
--- a/.github/actions/README.md
+++ b/.github/actions/README.md
@@ -4,6 +4,13 @@ This directory contains a modular, reusable GitHub Actions architecture optimize
 
 ## Composite Actions
 
+### Script Conventions
+
+Composite action helper scripts in this directory are intentionally portable
+across checkout modes: keep them mode `0644` and invoke them as
+`bash path/to/script.sh` from workflows or `action.yml` files. Do not rely on
+executable bits or `./script.sh` invocation.
+
 ### Core CI/CD Actions
 
 #### `security-scan/`
@@ -50,7 +57,8 @@ This action runs `tools/setup-tools --skip-go --skip-docker` in auto mode, which
 **When to use**: When you need version values in workflow steps
 **Outputs**:
 - `go`, `goreleaser`, `ko`, `crane`, `golangci_lint`, `yamllint`, `addlicense`
-- `grype`, `kubectl`, `kind`, `ctlptl`, `tilt`, `helm`
+- `grype`, `kubectl`, `kind`, `nvkind`, `ctlptl`, `tilt`, `helm`
+- `kind_node_image`, `h100_kind_node_image`
 
 **Example**:
 ```yaml
diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml
index 7a973ae21..671392215 100644
--- a/.github/actions/aicr-build/action.yml
+++ b/.github/actions/aicr-build/action.yml
@@ -13,9 +13,17 @@
 # limitations under the License.
 
 name: 'AICR Build'
-description: 'Builds the aicr validator image (via Dockerfile) and CLI binary, and loads the image into kind.'
+description: 'Builds the aicr CLI and optional snapshot/validator images, and loads requested images into kind.'
 
 inputs:
+  build_cli:
+    description: 'Build and stage the aicr CLI binary at the repository root'
+    required: false
+    default: 'true'
+  build_snapshot_agent:
+    description: 'Build the CUDA-based snapshot agent image and load it into kind'
+    required: false
+    default: 'true'
   build_validators:
     description: 'Deprecated: use validator_phases instead. Ignored when validator_phases is set.'
     required: false
@@ -28,86 +36,27 @@ inputs:
 runs:
   using: 'composite'
   steps:
-
-    - name: Install ko
-      shell: bash
-      run: |
-        KO_VERSION=$(yq eval '.build_tools.ko' .settings.yaml)
-        GOFLAGS= go install "github.com/google/ko@${KO_VERSION}"
-
-    - name: Build snapshot agent image and load into kind
+    - name: Build aicr CLI binary
+      if: inputs.build_cli == 'true' || inputs.build_snapshot_agent == 'true'  # the snapshot agent image COPYs dist/aicr, so this build also runs when only build_snapshot_agent is 'true'
       shell: bash
       env:
         GOFLAGS: -mod=vendor
-      run: |
-        # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
-        # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed.
-        # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot.
-        CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
-        docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
-        FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04
-        COPY dist/aicr /usr/local/bin/aicr
-        ENTRYPOINT ["/usr/local/bin/aicr"]
-        DOCKERFILE
+      run: bash "${{ github.action_path }}/build-cli.sh"
 
-        # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but
-        # does not set a node selector, so it can land on any GPU-capable node
-        # including the control-plane (e.g., T4 smoke test).
-        #
-        # Timeout is intentionally generous (900s per attempt). H100 self-hosted
-        # runners transfer images over a shared Docker-in-Docker bridge; large
-        # CUDA base images (~250MB compressed) combined with I/O contention from
-        # parallel GPU operator pods regularly exceed the previous 600s limit.
-        timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
-          echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
-          timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
-        }
+    - name: Build snapshot agent image and load into kind
+      if: inputs.build_snapshot_agent == 'true'
+      shell: bash
+      run: bash "${{ github.action_path }}/build-snapshot-agent.sh"
 
     - name: Build validator images and load into kind
       if: "!(inputs.validator_phases == 'none' || (inputs.validator_phases == '' && inputs.build_validators == 'false'))"
       shell: bash
       env:
         GOFLAGS: -mod=vendor
-      run: |
-        # Determine which validator phases to build.
-        # validator_phases takes precedence; build_validators is a deprecated fallback.
-        if [[ -n "${{ inputs.validator_phases }}" ]]; then
-          if [[ "${{ inputs.validator_phases }}" == "none" ]]; then
-            echo "Skipping validator builds (validator_phases=none)"
-            exit 0
-          fi
-          PHASES="${{ inputs.validator_phases }}"
-        else
-          # Default: build all phases (backwards compatible)
-          PHASES="deployment,performance,conformance"
-        fi
-
-        # Compile only the requested validator binaries.
-        mkdir -p dist/validator
-        for phase in ${PHASES//,/ }; do
-          echo "Building validator binary: ${phase}"
-          CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}"
-        done
-
-        for phase in ${PHASES//,/ }; do
-          mkdir -p "validators/${phase}/testdata"
-          docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <<DOCKERFILE
-        FROM gcr.io/distroless/static-debian12:nonroot
-        COPY dist/validator/${phase} /${phase}
-        COPY validators/${phase}/testdata /app/testdata
-        WORKDIR /app
-        USER nonroot
-        ENTRYPOINT ["/${phase}"]
-        DOCKERFILE
-          # Validator images are small (~30MB distroless), but share the same
-          # Docker-in-Docker bridge as the smoke-test load above. 600s per
-          # attempt accommodates I/O queuing behind concurrent image pulls.
-          timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
-            echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..."
-            timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
-          }
-        done
+        VALIDATOR_PHASES: ${{ inputs.validator_phases }}
+      run: bash "${{ github.action_path }}/build-validator-images.sh"
 
     - name: Stage aicr binary at repo root
+      if: inputs.build_cli == 'true'
       shell: bash
-      run: cp dist/aicr ./aicr
+      run: bash "${{ github.action_path }}/stage-cli.sh"
diff --git a/.github/actions/aicr-build/build-cli.sh b/.github/actions/aicr-build/build-cli.sh
new file mode 100644
index 000000000..c87428241
--- /dev/null
+++ b/.github/actions/aicr-build/build-cli.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail  # abort on command failure, on unset variables, and on a failing pipeline stage
+
+mkdir -p dist  # output directory for the binary; no error if it already exists
+CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr  # build ./cmd/aicr with cgo disabled and write it to dist/aicr (paths are relative to the current directory)
diff --git a/.github/actions/aicr-build/build-snapshot-agent.sh b/.github/actions/aicr-build/build-snapshot-agent.sh
new file mode 100644
index 000000000..512aad2f0
--- /dev/null
+++ b/.github/actions/aicr-build/build-snapshot-agent.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Builds the ko.local:smoke-test snapshot agent image from dist/aicr and loads
+# it into the kind cluster named by KIND_CLUSTER_NAME. The docker build context
+# is ".", so run this from the directory that contains dist/aicr.
+
+# Fail fast when the target cluster is not named. Without this check `set -u`
+# would abort the script only at `kind load`, after the image build has
+# already run.
+: "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+
+# Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
+# Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) because only nvidia-smi is needed.
+timeout 900s docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
+FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04
+COPY dist/aicr /usr/local/bin/aicr
+ENTRYPOINT ["/usr/local/bin/aicr"]
+DOCKERFILE
+
+# Load onto all nodes. The snapshot agent requests nvidia.com/gpu but does not
+# set a node selector, so it can land on any GPU-capable node including the
+# control-plane in the L40G smoke test.
+# One retry; each attempt is capped at 900 seconds.
+timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
+  echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
+  timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
+}
diff --git a/.github/actions/aicr-build/build-validator-images.sh b/.github/actions/aicr-build/build-validator-images.sh
new file mode 100644
index 000000000..f098e84e8
--- /dev/null
+++ b/.github/actions/aicr-build/build-validator-images.sh
@@ -0,0 +1,87 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Builds one validator binary and container image per requested phase and loads
+# each image into the kind cluster named by KIND_CLUSTER_NAME.
+#
+# Environment:
+#   VALIDATOR_PHASES   Comma- and/or whitespace-separated phase names. "none"
+#                      skips all builds; empty or unset selects the default set.
+#   KIND_CLUSTER_NAME  Target kind cluster; required unless builds are skipped.
+
+VALIDATOR_PHASES="${VALIDATOR_PHASES:-}"
+if [[ -n "${VALIDATOR_PHASES}" ]]; then
+  if [[ "${VALIDATOR_PHASES}" == "none" ]]; then
+    echo "Skipping validator builds (validator_phases=none)"
+    exit 0
+  fi
+  PHASES="${VALIDATOR_PHASES}"
+else
+  # Default: build all phases (backwards compatible).
+  PHASES="deployment,performance,conformance"
+fi
+
+: "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+
+# Split the list once into an array. Pathname expansion is switched off around
+# the unquoted expansion so an entry such as "*" reaches the name check below
+# verbatim instead of being replaced by file names from the working directory.
+set -f
+# shellcheck disable=SC2206
+phase_list=(${PHASES//,/ })
+set +f
+
+mkdir -p dist/validator
+if (( ${#phase_list[@]} == 0 )); then
+  echo "::warning::validator phase list '${PHASES}' contains no phase names; nothing to build"
+  exit 0
+fi
+
+# Check every name before compiling anything, so a bad entry late in the list
+# fails immediately instead of after earlier phases were already built.
+for phase in "${phase_list[@]}"; do
+  if ! [[ "${phase}" =~ ^[a-z][a-z0-9_-]*$ ]]; then
+    echo "::error::invalid validator phase '${phase}'; expected ^[a-z][a-z0-9_-]*$"
+    exit 1
+  fi
+done
+
+for phase in "${phase_list[@]}"; do
+  echo "Building validator binary: ${phase}"
+  CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}"
+done
+
+for phase in "${phase_list[@]}"; do
+  # The image build below COPYs validators/<phase>/testdata, so it must exist.
+  if [[ ! -d "validators/${phase}/testdata" ]]; then
+    echo "::warning::validators/${phase}/testdata is missing; creating empty testdata directory"
+    mkdir -p "validators/${phase}/testdata"
+  fi
+  docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <<DOCKERFILE
+FROM gcr.io/distroless/static-debian12:nonroot
+COPY dist/validator/${phase} /${phase}
+COPY validators/${phase}/testdata /app/testdata
+WORKDIR /app
+USER nonroot
+ENTRYPOINT ["/${phase}"]
+DOCKERFILE
+  # One retry; each attempt is capped at 600 seconds.
+  timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
+    echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..."
+    timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
+  }
+done
diff --git a/.github/actions/aicr-build/stage-cli.sh b/.github/actions/aicr-build/stage-cli.sh
new file mode 100644
index 000000000..c5b737a4d
--- /dev/null
+++ b/.github/actions/aicr-build/stage-cli.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail  # abort on command failure, on unset variables, and on a failing pipeline stage
+
+cp dist/aicr ./aicr  # copy the built CLI from dist/ to ./aicr in the current directory; fails (and aborts) if dist/aicr is missing
diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
new file mode 100644
index 000000000..85833925f
--- /dev/null
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -0,0 +1,90 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'Check Control Plane Health'
+description: 'Fails if Kind control-plane static pods are missing, unready, or unstable.'
+
+inputs:
+  cluster_name:
+    description: 'Kind cluster name'
+    required: true
+  namespace:
+    description: 'Namespace that contains the control-plane pods'
+    required: false
+    default: 'kube-system'
+  components:
+    description: 'Space-separated component label values to check'
+    required: false
+    default: 'kube-apiserver kube-controller-manager kube-scheduler etcd'
+  wait_timeout:
+    description: 'Timeout for each component readiness wait'
+    required: false
+    default: '60s'  # the script accepts only <digits> followed by s, m or h
+  max_restarts:
+    description: 'Deprecated compatibility input; historical restart counts are reported but not capped'
+    required: false
+    default: '1'  # any other value only triggers a deprecation warning in the script
+  stability_window:
+    description: 'Optional duration to watch for new control-plane restarts after pods are Ready'
+    required: false
+    default: '0s'  # <digits> followed by s, m or h
+  stability_probe_interval:
+    description: 'Interval for active API server probes during the stability window'
+    required: false
+    default: '10s'  # <digits> followed by s, m or h; must be greater than 0
+  stability_probe_failure_threshold:
+    description: 'Consecutive active stability probe failures allowed before failing'
+    required: false
+    default: '2'  # integer greater than 0
+  lease_components:
+    description: 'Space-separated leader election lease names to check for freshness'
+    required: false
+    default: 'kube-controller-manager kube-scheduler'
+  lease_stale_timeout:
+    description: 'Maximum allowed leader election lease age at the end of a stability window'
+    required: false
+    default: '120s'  # <digits> followed by s, m or h; must be greater than 0
+  recover_unhealthy:
+    description: 'Restart eligible Kind control-plane static pod containers when they are currently unhealthy'
+    required: false
+    default: 'false'  # the script accepts exactly 'true' or 'false'
+  recovery_components:
+    description: 'Space-separated component label values eligible for recovery'
+    required: false
+    default: 'kube-controller-manager kube-scheduler kube-apiserver'
+  max_recovery_attempts:
+    description: 'Maximum recovery attempts for each eligible component'
+    required: false
+    default: '1'  # non-negative integer
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Check control-plane pods
+      shell: bash
+      env:  # every input reaches the script as an environment variable; none is interpolated into the shell command line
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+        NAMESPACE: ${{ inputs.namespace }}
+        COMPONENTS: ${{ inputs.components }}
+        WAIT_TIMEOUT: ${{ inputs.wait_timeout }}
+        MAX_RESTARTS: ${{ inputs.max_restarts }}
+        STABILITY_WINDOW: ${{ inputs.stability_window }}
+        STABILITY_PROBE_INTERVAL: ${{ inputs.stability_probe_interval }}
+        STABILITY_PROBE_FAILURE_THRESHOLD: ${{ inputs.stability_probe_failure_threshold }}
+        LEASE_COMPONENTS: ${{ inputs.lease_components }}
+        LEASE_STALE_TIMEOUT: ${{ inputs.lease_stale_timeout }}
+        RECOVER_UNHEALTHY: ${{ inputs.recover_unhealthy }}
+        RECOVERY_COMPONENTS: ${{ inputs.recovery_components }}
+        MAX_RECOVERY_ATTEMPTS: ${{ inputs.max_recovery_attempts }}
+      run: bash "${{ github.action_path }}/check-control-plane-health.sh"  # run through bash explicitly, so the script needs no executable bit
diff --git a/.github/actions/check-control-plane-health/check-control-plane-health.sh b/.github/actions/check-control-plane-health/check-control-plane-health.sh
new file mode 100644
index 000000000..350538255
--- /dev/null
+++ b/.github/actions/check-control-plane-health/check-control-plane-health.sh
@@ -0,0 +1,626 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Accept "$2" only when it is digits followed by a single s, m or h; otherwise emit an error naming input "$1" and exit 1.
+validate_duration_input() {
+  local input_name="$1" input_value="$2"
+
+  [[ "${input_value}" =~ ^[0-9]+[smh]$ ]] && return 0
+
+  echo "::error::${input_name} must be a duration like 60s, 2m, or 1h; got '${input_value}'"
+  exit 1
+}
+
+duration_seconds() {  # usage: duration_seconds <digits><s|m|h>; prints the duration in whole seconds on stdout
+  local input_value="$1"
+  local number="${input_value%[smh]}"  # strip one trailing unit letter
+  local unit="${input_value: -1}"  # last character; the space before -1 is required by the substring syntax
+  local amount
+
+  amount=$((10#${number}))  # 10# forces base 10, so zero-padded input such as 08 is not parsed as octal
+
+  case "${unit}" in
+    s) echo "${amount}" ;;
+    m) echo $((amount * 60)) ;;
+    h) echo $((amount * 3600)) ;;
+    *)  # any other trailing character
+      echo "::error::unsupported duration unit in '${input_value}'" >&2
+      exit 1
+      ;;
+  esac
+}
+
+MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}"  # strip leading whitespace (this two-line trim idiom is repeated for every input below)
+MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}"  # strip trailing whitespace
+if ! [[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then
+  echo "::error::max_recovery_attempts must be a non-negative integer, got '${MAX_RECOVERY_ATTEMPTS}'"
+  exit 1
+fi
+MAX_RECOVERY_ATTEMPTS=$((10#${MAX_RECOVERY_ATTEMPTS}))  # force base 10: a zero-padded value such as 08 is an invalid octal literal in (( )), and 010 would silently mean 8
+
+MAX_RESTARTS="${MAX_RESTARTS:-}"
+MAX_RESTARTS="${MAX_RESTARTS#"${MAX_RESTARTS%%[![:space:]]*}"}"
+MAX_RESTARTS="${MAX_RESTARTS%"${MAX_RESTARTS##*[![:space:]]}"}"
+if [[ -n "${MAX_RESTARTS}" ]] && [[ "${MAX_RESTARTS}" != "1" ]]; then  # warn only for a non-empty value other than "1"
+  echo "::warning::max_restarts is deprecated and ignored; use stability_window to fail on new control-plane restarts"
+fi
+
+WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}"
+WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}"
+validate_duration_input wait_timeout "${WAIT_TIMEOUT}"
+
+STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}"
+STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}"
+STABILITY_WINDOW="${STABILITY_WINDOW:-0s}"  # a blank input is treated as 0s
+validate_duration_input stability_window "${STABILITY_WINDOW}"
+if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then  # canonicalize every zero duration (0m, 00h, ...) to 0s
+  STABILITY_WINDOW="0s"
+fi
+STABILITY_WINDOW_SECONDS="$(duration_seconds "${STABILITY_WINDOW}")"
+
+STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL:-10s}"
+STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL#"${STABILITY_PROBE_INTERVAL%%[![:space:]]*}"}"
+STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL%"${STABILITY_PROBE_INTERVAL##*[![:space:]]}"}"
+validate_duration_input stability_probe_interval "${STABILITY_PROBE_INTERVAL}"
+STABILITY_PROBE_INTERVAL_SECONDS="$(duration_seconds "${STABILITY_PROBE_INTERVAL}")"
+if (( STABILITY_PROBE_INTERVAL_SECONDS <= 0 )); then
+  echo "::error::stability_probe_interval must be greater than 0, got '${STABILITY_PROBE_INTERVAL}'"
+  exit 1
+fi
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD:-2}"
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD#"${STABILITY_PROBE_FAILURE_THRESHOLD%%[![:space:]]*}"}"
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD%"${STABILITY_PROBE_FAILURE_THRESHOLD##*[![:space:]]}"}"
+if ! [[ "${STABILITY_PROBE_FAILURE_THRESHOLD}" =~ ^[0-9]+$ ]]; then
+  echo "::error::stability_probe_failure_threshold must be a positive integer, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'"
+  exit 1
+fi
+STABILITY_PROBE_FAILURE_THRESHOLD=$((10#${STABILITY_PROBE_FAILURE_THRESHOLD}))  # force base 10 before any (( )) use, for the same reason as max_recovery_attempts above
+if (( STABILITY_PROBE_FAILURE_THRESHOLD <= 0 )); then
+  echo "::error::stability_probe_failure_threshold must be greater than 0, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'"
+  exit 1
+fi
+
+LEASE_COMPONENTS="${LEASE_COMPONENTS:-kube-controller-manager kube-scheduler}"
+LEASE_COMPONENTS="${LEASE_COMPONENTS#"${LEASE_COMPONENTS%%[![:space:]]*}"}"
+LEASE_COMPONENTS="${LEASE_COMPONENTS%"${LEASE_COMPONENTS##*[![:space:]]}"}"
+
+LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT:-120s}"
+LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT#"${LEASE_STALE_TIMEOUT%%[![:space:]]*}"}"
+LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT%"${LEASE_STALE_TIMEOUT##*[![:space:]]}"}"
+validate_duration_input lease_stale_timeout "${LEASE_STALE_TIMEOUT}"
+LEASE_STALE_TIMEOUT_SECONDS="$(duration_seconds "${LEASE_STALE_TIMEOUT}")"
+if (( LEASE_STALE_TIMEOUT_SECONDS <= 0 )); then
+  echo "::error::lease_stale_timeout must be greater than 0, got '${LEASE_STALE_TIMEOUT}'"
+  exit 1
+fi
+
+RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}"
+RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}"
+case "${RECOVER_UNHEALTHY}" in
+  true|false) ;;
+  *)
+    echo "::error::recover_unhealthy must be true or false, got '${RECOVER_UNHEALTHY}'"
+    exit 1
+    ;;
+esac
+
+kubectl_kind() {  # run kubectl against context kind-${KIND_CLUSTER_NAME}; all arguments are passed through
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"  # 10s per API request, 30s hard cap on the whole invocation
+}
+
+docker_timeout() {  # run docker with the given arguments under a 30s hard cap
+  timeout 30s docker "$@"
+}
+
+STATIC_POD_RECREATE_SETTLE_SECONDS=5  # NOTE(review): not referenced in the visible part of this script; presumably a pause after a static pod is recreated — confirm at the use site
+RESTART_COUNT_ATTEMPTS=3  # how many times restart_total queries restart counts before giving up
+RESTART_COUNT_RETRY_SLEEP_SECONDS=2  # pause between those attempts
+declare -A RECOVERY_ATTEMPTS=()  # associative array; NOTE(review): presumably keyed by component name — confirm at the use site
+declare -A INITIAL_RESTARTS=()  # associative array; NOTE(review): presumably the per-component restart baseline — confirm at the use site
+
+kubectl_kind get --raw='/readyz' || true  # print the initial /readyz response for the log; a failure here is deliberately ignored
+
+# Wait until the pods labelled component="$1" report condition Ready, for at most WAIT_TIMEOUT.
+wait_ready() {
+  local label_selector="component=$1"
+
+  # Any failure, whether a kubectl error or a timeout, is collapsed to exit status 1.
+  timeout "${WAIT_TIMEOUT}" kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+    wait --for=condition=Ready pod -l "${label_selector}" --timeout="${WAIT_TIMEOUT}" \
+    || return 1
+}
+
+restart_total() {  # usage: restart_total <component>; prints the summed restartCount of all containers in pods labelled component=<component>
+  local component="$1"
+  local selector="component=${component}"
+  local restart_counts  # newline-separated restartCount values, one per container
+  local restart_count
+  local total=0
+  local attempt
+
+  for ((attempt = 1; attempt <= RESTART_COUNT_ATTEMPTS; attempt++)); do  # retry while the query fails or returns no statuses
+    if restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \
+      -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then
+      if [[ -n "${restart_counts}" ]]; then
+        break  # got at least one count
+      fi
+      echo "::warning::no container statuses found for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
+    else
+      echo "::warning::failed to read restart counts for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
+    fi
+
+    if (( attempt < RESTART_COUNT_ATTEMPTS )); then  # no sleep after the final attempt
+      sleep "${RESTART_COUNT_RETRY_SLEEP_SECONDS}"
+    fi
+  done
+
+  if [[ -z "${restart_counts}" ]]; then  # every attempt failed or came back empty
+    echo "::error::no container statuses found for ${component} pods after ${RESTART_COUNT_ATTEMPTS} attempts" >&2
+    dump_component_diagnostics "${component}" >&2
+    exit 1  # NOTE(review): if a caller captures the output with $(...), this ends only that subshell — confirm callers check the status
+  fi
+
+  while IFS= read -r restart_count; do
+    [[ -z "${restart_count}" ]] && continue  # skip blank lines
+    total=$((total + restart_count))
+  done <<< "${restart_counts}"
+  echo "${total}"  # the only stdout output; the warnings and diagnostics above go to stderr
+}
+
+# Report component "$1"'s restart count "$2": a plain line when it is not positive, a warning annotation otherwise.
+report_restart_baseline() {
+  local component="$1" restart_count="$2"
+  if (( restart_count <= 0 )); then
+    echo "${component} restartCount=${restart_count}"
+  else
+    # Restarts that happened before this check are surfaced but never treated as a failure here.
+    echo "::warning::${component} has historical restartCount=${restart_count}; checking current readiness and stability window only"
+  fi
+}
+
+dump_control_plane_summary() {  # best-effort listing of pods labelled tier=control-plane, followed by one line per pod with its container restart counts
+  echo "=== Control-plane pod restart summary ==="
+  kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane -o wide || true  # diagnostics only: a failure is ignored
+  kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane \
+    -o jsonpath='{range .items[*]}{.metadata.name}{" restartCount="}{range .status.containerStatuses[*]}{.restartCount}{" "}{end}{"\n"}{end}' || true
+}
+
+# Hard gate on kube-apiserver /readyz: print the response on success; otherwise report "$1" as context, dump diagnostics and exit 1.
+require_readyz() {
+  local reason="$1"
+
+  kubectl_kind get --raw='/readyz' && return 0
+  echo "::error::kube-apiserver /readyz failed ${reason}"
+  dump_all_control_plane_runtime_diagnostics
+  exit 1
+}
+
+probe_control_plane_api() {  # usage: probe_control_plane_api <reason>; returns 1 (never exits) when /readyz or any lease in LEASE_COMPONENTS cannot be read
+  local reason="$1"  # appended to the error annotations
+  local component
+  local lease_summary
+
+  if ! kubectl_kind get --raw='/readyz' >/dev/null; then  # only the exit status matters; the response body is discarded
+    echo "::error::kube-apiserver /readyz probe failed ${reason}"
+    return 1
+  fi
+
+  for component in ${LEASE_COMPONENTS}; do  # unquoted so the space-separated list is word-split
+    if ! lease_summary=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" \
+      -o jsonpath='{.metadata.name}{" holder="}{.spec.holderIdentity}{" renewTime="}{.spec.renewTime}{"\n"}' 2>/dev/null); then
+      echo "::error::failed to read leader election lease ${component} ${reason}"
+      return 1
+    fi
+    echo "${lease_summary}"  # log the lease name, holder and renewTime
+  done
+}
+
+lease_renew_epoch() {  # usage: lease_renew_epoch <timestamp>; prints the timestamp as seconds since the epoch, or fails silently if date cannot parse it
+  local renew_time="$1"
+
+  date -u -d "${renew_time}" +%s 2>/dev/null  # NOTE(review): relies on date accepting -d <string> (GNU date) — confirm for the runner image
+}
+
+verify_leader_lease_freshness() {  # exits 1, after dumping diagnostics, when any lease in LEASE_COMPONENTS is unreadable, unparsable or older than LEASE_STALE_TIMEOUT_SECONDS
+  local component
+  local now_epoch
+  local renew_time
+  local renew_epoch
+  local lease_age
+
+  [[ -z "${LEASE_COMPONENTS}" ]] && return  # nothing to check
+
+  echo "Checking leader election lease freshness (max age ${LEASE_STALE_TIMEOUT})..."
+  for component in ${LEASE_COMPONENTS}; do
+    if ! renew_time=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o jsonpath='{.spec.renewTime}' 2>/dev/null); then
+      echo "::error::failed to read leader election lease ${component}"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+    if [[ -z "${renew_time}" ]]; then
+      echo "::error::leader election lease ${component} has empty spec.renewTime"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+    if ! renew_epoch="$(lease_renew_epoch "${renew_time}")"; then
+      echo "::error::failed to parse leader election lease ${component} renewTime '${renew_time}'"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+    now_epoch="$(date -u +%s)"  # sampled per lease, after the read, so time spent in kubectl calls cannot understate the age
+    lease_age=$((now_epoch - renew_epoch))
+    if (( lease_age < 0 )); then  # renewTime is ahead of the local clock: report the lease as brand new
+      lease_age=0
+    fi
+    echo "${component} lease renewTime=${renew_time} age=${lease_age}s"
+    if (( lease_age > LEASE_STALE_TIMEOUT_SECONDS )); then
+      echo "::error::leader election lease ${component} is stale: age=${lease_age}s exceeds ${LEASE_STALE_TIMEOUT}"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+  done
+}
+
+observe_stability_window() {  # usage: observe_stability_window <label>; probes the control plane for STABILITY_WINDOW_SECONDS, then checks lease freshness
+  local label="$1"  # free-text tag included in every log line
+  local elapsed=0  # seconds slept so far
+  local probe=0  # number of probes issued
+  local sleep_seconds
+  local consecutive_failures=0
+  local total_failures=0
+
+  echo "Observing control-plane stability for ${STABILITY_WINDOW} (${label}); probing every ${STABILITY_PROBE_INTERVAL}, failing after ${STABILITY_PROBE_FAILURE_THRESHOLD} consecutive probe failure(s)..."
+  while (( elapsed < STABILITY_WINDOW_SECONDS )); do  # never entered when the window is 0 seconds
+    sleep_seconds="${STABILITY_PROBE_INTERVAL_SECONDS}"
+    if (( elapsed + sleep_seconds > STABILITY_WINDOW_SECONDS )); then  # shorten the last sleep so the window is not overrun
+      sleep_seconds=$((STABILITY_WINDOW_SECONDS - elapsed))
+    fi
+    if (( sleep_seconds > 0 )); then
+      sleep "${sleep_seconds}"  # sleep first: each probe runs at the end of its interval
+      elapsed=$((elapsed + sleep_seconds))
+    fi
+
+    probe=$((probe + 1))
+    echo "=== Control-plane stability probe ${probe} (${elapsed}/${STABILITY_WINDOW_SECONDS}s, ${label}) ==="
+    if probe_control_plane_api "during ${label} stability probe ${probe}"; then
+      consecutive_failures=0  # a successful probe resets the streak
+      continue
+    fi
+
+    total_failures=$((total_failures + 1))
+    consecutive_failures=$((consecutive_failures + 1))
+    echo "::warning::control-plane stability probe ${probe} failed (${consecutive_failures} consecutive, ${total_failures} total)"
+    if (( consecutive_failures >= STABILITY_PROBE_FAILURE_THRESHOLD )); then  # only an unbroken run of failures is fatal
+      echo "::error::control-plane had ${consecutive_failures} consecutive failed stability probes during ${label}"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+  done
+
+  if (( total_failures > 0 )); then  # isolated failures are reported but do not fail the window
+    echo "::warning::control-plane had ${total_failures} transient failed stability probe(s) during ${label}; final health checks must still pass"
+  fi
+  verify_leader_lease_freshness  # runs once, after the window (also when the loop body never ran)
+}
+
+dump_api_server_health() {  # best-effort dump of the kube-apiserver health endpoints
+  local endpoint
+
+  for endpoint in '/livez?verbose' '/readyz?verbose' '/healthz'; do
+    echo "=== kube-apiserver ${endpoint} ==="
+    kubectl_kind get --raw="${endpoint}" || true  # diagnostics only: a failing endpoint does not abort the dump
+  done
+}
+
+dump_kind_node_runtime_summary() {  # best-effort resource and CRI snapshot of the kind control-plane node container
+  local node="${KIND_CLUSTER_NAME}-control-plane"  # docker container name of the control-plane node
+
+  if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then  # skip everything when the node container is not visible to docker
+    echo "::warning::cannot collect node runtime summary: kind node container ${node} not found"
+    return
+  fi
+
+  echo "=== ${node} docker stats ==="
+  docker_timeout stats --no-stream \
+    --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \
+    "${node}" || true
+
+  echo "=== ${node} docker inspect state ==="
+  docker_timeout inspect \
+    --format 'status={{.State.Status}} running={{.State.Running}} oomKilled={{.State.OOMKilled}} pid={{.State.Pid}} started={{.State.StartedAt}} finished={{.State.FinishedAt}}' \
+    "${node}" || true
+
+  echo "=== ${node} node pressure snapshot ==="
+  docker_timeout exec "${node}" sh -c '
+    date
+    uptime || true
+    free -h || true
+    df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+    echo "--- top cpu/memory processes ---"
+    ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
+  ' || true
+
+  echo "=== ${node} CRI pod/container summary ==="
+  docker_timeout exec "${node}" crictl pods || true  # each crictl call is independent; a failure does not stop the next one
+  docker_timeout exec "${node}" crictl ps -a || true
+  docker_timeout exec "${node}" crictl stats || true
+}
+
+dump_static_pod_runtime_diagnostics() {  # usage: dump_static_pod_runtime_diagnostics <component>; best-effort manifest, CRI and journal dump from the kind control-plane node
+  local component="$1"
+  local node="${KIND_CLUSTER_NAME}-control-plane"  # docker container name of the control-plane node
+  local container_ids
+  local container_id
+  local count=0
+
+  if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then  # skip everything when the node container is not visible to docker
+    echo "::warning::cannot collect ${component} runtime diagnostics: kind node container ${node} not found"
+    return
+  fi
+
+  echo "=== ${node} ${component} static pod manifest ==="
+  docker_timeout exec "${node}" sed -n '1,220p' "/etc/kubernetes/manifests/${component}.yaml" || true  # argv form: the component name is never re-parsed by a shell inside the node
+
+  echo "=== ${node} ${component} CRI containers ==="
+  docker_timeout exec "${node}" crictl ps -a --name "${component}" || true
+
+  container_ids=$(docker_timeout exec "${node}" crictl ps -a --name "${component}" -q 2>/dev/null || true)
+  for container_id in ${container_ids}; do
+    count=$((count + 1))
+    if (( count > 8 )); then  # bound the amount of log output per component
+      echo "Skipping remaining ${component} CRI containers after first 8 entries."
+      break
+    fi
+
+    echo "=== ${node} crictl inspect ${component} ${container_id} ==="
+    docker_timeout exec "${node}" crictl inspect "${container_id}" || true
+    echo "=== ${node} crictl logs ${component} ${container_id} ==="
+    docker_timeout exec "${node}" crictl logs --tail=200 "${container_id}" || true
+  done
+
+  echo "=== ${node} kubelet journal (${component}) ==="
+  docker_timeout exec "${node}" journalctl -u kubelet --since '45 minutes ago' --no-pager 2>/dev/null \
+    | grep -Ei "${component}|static pod|mirror pod|probe|liveness|readiness|startup|back-off|backoff|container|failed|error|oom|killed" \
+    | tail -200 || true
+
+  echo "=== ${node} containerd journal (${component}) ==="
+  docker_timeout exec "${node}" journalctl -u containerd --since '45 minutes ago' --no-pager 2>/dev/null \
+    | grep -Ei "${component}|container|task|shim|deadline|failed|error|oom|killed" \
+    | tail -200 || true
+}
+
+# Prints the cluster-wide summaries once, then the static pod runtime
+# diagnostics and the lease object for every name in COMPONENTS.
+# Globals read: COMPONENTS, NAMESPACE.
+dump_all_control_plane_runtime_diagnostics() {
+  local component
+  local collector
+
+  for collector in \
+    dump_control_plane_summary \
+    dump_api_server_health \
+    dump_kind_node_runtime_summary; do
+    "${collector}"
+  done
+
+  for component in ${COMPONENTS}; do
+    dump_static_pod_runtime_diagnostics "${component}"
+    # A missing lease is not an error here; the dump is best-effort.
+    kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true
+  done
+}
+
+# Prints API-level and node-level diagnostics for one control-plane component.
+#
+# Arguments:
+#   $1 - component name; pods are selected with the label component=<name>.
+# Globals read: NAMESPACE.
+# All kubectl calls are best-effort so a broken API server cannot abort the dump.
+dump_component_diagnostics() {
+  local component="$1"
+  local selector="component=${component}"
+  local pods
+  local pod
+  local -a in_namespace=(-n "${NAMESPACE}")
+
+  dump_control_plane_summary
+  kubectl_kind "${in_namespace[@]}" get pod -l "${selector}" -o wide || true
+  kubectl_kind "${in_namespace[@]}" describe pod -l "${selector}" || true
+  kubectl_kind "${in_namespace[@]}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+
+  # Current and previous logs for every matching pod, one pod name per line.
+  pods=$(kubectl_kind "${in_namespace[@]}" get pod -l "${selector}" -o name 2>/dev/null || true)
+  while IFS= read -r pod; do
+    [[ -n "${pod}" ]] || continue
+    echo "=== ${pod} logs ==="
+    kubectl_kind "${in_namespace[@]}" logs "${pod}" --all-containers --tail=100 2>/dev/null || true
+    echo "=== ${pod} previous logs ==="
+    kubectl_kind "${in_namespace[@]}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true
+  done <<< "${pods}"
+
+  dump_all_control_plane_runtime_diagnostics
+  kubectl_kind "${in_namespace[@]}" get lease "${component}" -o yaml 2>/dev/null || true
+}
+
+# Succeeds (status 0) when $1 equals one of the whitespace-separated names in
+# RECOVERY_COMPONENTS; returns 1 otherwise.
+is_recovery_component() {
+  local component="$1"
+  local candidate
+
+  for candidate in ${RECOVERY_COMPONENTS}; do
+    [[ "${candidate}" != "${component}" ]] || return 0
+  done
+  return 1
+}
+
+# Tries to recover an unhealthy control-plane component by stopping its
+# running CRI container(s) inside the kind control-plane node and then
+# waiting for the component to report Ready again.
+#
+# Arguments:
+#   $1 - component name
+#   $2 - human-readable reason, echoed in the warning annotation
+# Globals read: RECOVER_UNHEALTHY, MAX_RECOVERY_ATTEMPTS, KIND_CLUSTER_NAME,
+#   STATIC_POD_RECREATE_SETTLE_SECONDS.
+# Globals written: RECOVERY_ATTEMPTS[<component>] is incremented per attempt.
+# Returns: 0 when the component became Ready again; 1 when recovery is
+#   disabled, not allowed for this component, out of attempts, or unsuccessful.
+try_recover_component() {
+  local component="$1"
+  local reason="$2"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local attempt
+  local container_ids
+  local container_id
+
+  # Guard clauses: recovery must be switched on, budgeted, and permitted.
+  [[ "${RECOVER_UNHEALTHY}" == "true" ]] || return 1
+  if (( MAX_RECOVERY_ATTEMPTS == 0 )); then return 1; fi
+  is_recovery_component "${component}" || return 1
+
+  attempt="${RECOVERY_ATTEMPTS[${component}]:-0}"
+  if (( attempt >= MAX_RECOVERY_ATTEMPTS )); then return 1; fi
+  # Count the attempt before doing any work so failures also consume budget.
+  RECOVERY_ATTEMPTS["${component}"]=$((attempt + 1))
+
+  echo "::warning::${component} is unhealthy (${reason}); restarting static pod container (attempt $((attempt + 1))/${MAX_RECOVERY_ATTEMPTS})"
+  dump_component_diagnostics "${component}"
+
+  docker_timeout inspect "${node}" >/dev/null 2>&1 || {
+    echo "::warning::cannot recover ${component}: kind node container ${node} not found"
+    return 1
+  }
+
+  # Only running containers are listed here (no -a), since those get stopped.
+  container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null) || {
+    echo "::warning::cannot recover ${component}: timed out or failed to list containers in ${node}"
+    return 1
+  }
+  [[ -n "${container_ids}" ]] || {
+    echo "::warning::cannot recover ${component}: no running container found in ${node}"
+    return 1
+  }
+
+  for container_id in ${container_ids}; do
+    echo "Stopping ${component} container ${container_id} in ${node}..."
+    docker_timeout exec "${node}" crictl stop "${container_id}" || {
+      echo "::warning::failed to stop ${component} container ${container_id}"
+      return 1
+    }
+  done
+
+  # Give kubelet a short interval to observe the stopped CRI container
+  # and refresh the mirror pod before kubectl wait reads pod status.
+  sleep "${STATIC_POD_RECREATE_SETTLE_SECONDS}"
+  wait_ready "${component}" || {
+    echo "::warning::${component} did not recover after static pod container restart"
+    dump_component_diagnostics "${component}"
+    kubectl_kind get --raw='/readyz' || true
+    return 1
+  }
+
+  echo "${component} recovered after static pod container restart."
+  return 0
+}
+
+# Verifies that one control-plane component has pods and that they are Ready,
+# attempting recovery once listing or readiness fails, then records the
+# component's restart total as the baseline for the stability window.
+#
+# Arguments:
+#   $1 - component name; pods are selected with the label component=<name>.
+# Globals read: NAMESPACE, WAIT_TIMEOUT.
+# Globals written: INITIAL_RESTARTS[<component>].
+# Exits the script with status 1 on any unrecoverable failure.
+check_component() {
+  local component="$1"
+  local selector="component=${component}"
+  local pods
+  local initial_restarts
+
+  pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name) || {
+    try_recover_component "${component}" "failed to list pods in ${NAMESPACE} with selector ${selector}" || {
+      echo "::error::failed to list ${component} pods in ${NAMESPACE} with selector ${selector}"
+      kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+      exit 1
+    }
+    # Recovery reported success, so the listing is expected to work now.
+    pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name) || {
+      echo "::error::failed to list ${component} pods after recovery"
+      kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+      exit 1
+    }
+  }
+
+  [[ -n "${pods}" ]] || {
+    echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}"
+    kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+    exit 1
+  }
+
+  wait_ready "${component}" || {
+    try_recover_component "${component}" "pods did not become Ready within ${WAIT_TIMEOUT}" || {
+      echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    }
+  }
+
+  initial_restarts=$(restart_total "${component}")
+  report_restart_baseline "${component}" "${initial_restarts}"
+  INITIAL_RESTARTS["${component}"]="${initial_restarts}"
+}
+
+# Confirms that every component in COMPONENTS stays Ready and does not
+# restart across a stability window.
+#
+# Flow:
+#   1. Returns immediately when STABILITY_WINDOW is "0s".
+#   2. Observes the primary window, then for each component checks readiness
+#      and compares the restart total against INITIAL_RESTARTS. An unready
+#      component gets one try_recover_component call; on success its baseline
+#      is re-read and the component is skipped for this pass.
+#   3. If any component was recovered, observes one more window and repeats
+#      the checks, this time without attempting recovery.
+#
+# Globals read: STABILITY_WINDOW, COMPONENTS.
+# Globals read/written: INITIAL_RESTARTS.
+# Exits the script with status 1 on any failed check; every failure branch
+# that dumps component diagnostics also prints the API server /readyz output.
+verify_stability_window() {
+  local component
+  local initial_restarts
+  local final_restarts
+  local recovered=false
+
+  if [[ "${STABILITY_WINDOW}" == "0s" ]]; then
+    return
+  fi
+
+  observe_stability_window "primary"
+  for component in ${COMPONENTS}; do
+    initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
+    if [[ -z "${initial_restarts}" ]]; then
+      echo "::error::missing initial restart count for ${component}"
+      exit 1
+    fi
+    if ! wait_ready "${component}"; then
+      if ! try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then
+        echo "::error::${component} pods became unready during ${STABILITY_WINDOW}"
+        dump_component_diagnostics "${component}"
+        kubectl_kind get --raw='/readyz' || true
+        exit 1
+      fi
+      # The deliberate restart changes the count, so take a fresh baseline
+      # and defer the restart comparison to the post-recovery window.
+      initial_restarts=$(restart_total "${component}")
+      report_restart_baseline "${component}" "${initial_restarts}"
+      INITIAL_RESTARTS["${component}"]="${initial_restarts}"
+      recovered=true
+      continue
+    fi
+    final_restarts=$(restart_total "${component}")
+    if (( final_restarts > initial_restarts )); then
+      echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    INITIAL_RESTARTS["${component}"]="${final_restarts}"
+  done
+
+  if [[ "${recovered}" != "true" ]]; then
+    return
+  fi
+
+  echo "::warning::control-plane recovery occurred; observing one additional ${STABILITY_WINDOW} stability window"
+  observe_stability_window "post-recovery"
+  for component in ${COMPONENTS}; do
+    initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
+    if [[ -z "${initial_restarts}" ]]; then
+      echo "::error::missing post-recovery restart count for ${component}"
+      exit 1
+    fi
+    if ! wait_ready "${component}"; then
+      echo "::error::${component} pods became unready after recovery"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    final_restarts=$(restart_total "${component}")
+    if (( final_restarts > initial_restarts )); then
+      echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery"
+      dump_component_diagnostics "${component}"
+      # Match the other failure branches, which all include the readyz output.
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    INITIAL_RESTARTS["${component}"]="${final_restarts}"
+  done
+}
+
+# Entry point: check every component first, then run the stability window,
+# and finish with a readyz check.
+for component in ${COMPONENTS}; do check_component "${component}"; done
+verify_stability_window
+require_readyz "after stability window"
diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index b9bc3060f..324ce7a8f 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -15,18 +15,91 @@
 name: 'GPU Cluster Setup'
 description: 'Creates a GPU-enabled kind cluster using nvkind with CDI-mode GPU passthrough.'
 
+inputs:
+  kind_node_image:
+    description: 'Kind node image for nvkind cluster creation'
+    required: false
+    default: ''
+  min_gpu_count:
+    description: 'Minimum visible GPU count required before cluster setup'
+    required: false
+    default: '1'
+  gpu_model_pattern:
+    description: 'Optional grep-compatible GPU model pattern required for visible GPUs'
+    required: false
+    default: ''
+  min_free_disk_gb:
+    description: 'Minimum free disk space on /, as a whole number of GiB, required before cluster setup'
+    required: false
+    default: '20'
+  min_available_memory_gb:
+    description: 'Minimum available system memory, as a whole number of GiB, required before cluster setup'
+    required: false
+    default: '8'
+  cluster_create_timeout:
+    description: 'Timeout for nvkind cluster create, written as a whole number followed by s, m, or h (for example 900s or 15m)'
+    required: false
+    default: '900s'
+  control_plane_resource_patches:
+    description: 'Apply kubeadm patches that raise control-plane static pod resource requests'
+    required: false
+    default: 'false'
+  control_plane_leader_election_tuning:
+    description: 'Increase kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes'
+    required: false
+    default: 'false'
+  leader_election_lease_duration:
+    description: 'Leader election lease duration (whole number followed by s, m, or h) when control_plane_leader_election_tuning is true'
+    required: false
+    default: '300s'
+  leader_election_renew_deadline:
+    description: 'Leader election renew deadline (whole number followed by s, m, or h) when control_plane_leader_election_tuning is true'
+    required: false
+    default: '240s'
+  leader_election_retry_period:
+    description: 'Leader election retry period (whole number followed by s, m, or h) when control_plane_leader_election_tuning is true'
+    required: false
+    default: '10s'
+  api_server_cpu_request:
+    description: 'kube-apiserver CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '1000m'
+  api_server_memory_request:
+    description: 'kube-apiserver memory request when control_plane_resource_patches is true'
+    required: false
+    default: '1Gi'
+  controller_manager_cpu_request:
+    description: 'kube-controller-manager CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '1000m'
+  controller_manager_memory_request:
+    description: 'kube-controller-manager memory request when control_plane_resource_patches is true'
+    required: false
+    default: '512Mi'
+  scheduler_cpu_request:
+    description: 'kube-scheduler CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '500m'
+  scheduler_memory_request:
+    description: 'kube-scheduler memory request when control_plane_resource_patches is true'
+    required: false
+    default: '256Mi'
+  etcd_cpu_request:
+    description: 'etcd CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '1000m'
+  etcd_memory_request:
+    description: 'etcd memory request when control_plane_resource_patches is true'
+    required: false
+    default: '1Gi'
+
 runs:
   using: 'composite'
   steps:
 
     - name: Validate environment
       shell: bash
-      run: |
-        if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then
-          echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow"
-          exit 1
-        fi
-
+      run: bash "${{ github.action_path }}/validate-env.sh"
     - name: Load versions
       id: versions
       uses: ./.github/actions/load-versions
@@ -52,40 +125,61 @@ runs:
 
     - name: Install nvkind
       shell: bash
-      run: |
-        go install github.com/NVIDIA/nvkind/cmd/nvkind@latest
-        nvkind --help
-
-    - name: Verify host GPU
+      env:
+        NVKIND_VERSION: ${{ steps.versions.outputs.nvkind }}
+      run: bash "${{ github.action_path }}/install-nvkind.sh"
+    - name: Runner preflight
       shell: bash
-      run: nvidia-smi -L
-
+      env:
+        GPU_MODEL_PATTERN: ${{ inputs.gpu_model_pattern }}
+        MIN_GPU_COUNT: ${{ inputs.min_gpu_count }}
+        MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
+        MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }}
+      run: bash "${{ github.action_path }}/runner-preflight.sh"
     - name: Configure NVIDIA Container Toolkit for kind
       shell: bash
-      run: |
-        sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
-        sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
-        sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place
-        sudo systemctl restart docker
-
+      run: bash "${{ github.action_path }}/configure-nvidia-container-toolkit.sh"
     - name: Validate Docker GPU access
       shell: bash
-      run: docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L
-
+      run: bash "${{ github.action_path }}/validate-docker-gpu-access.sh"
     - name: Increase inotify limits
       shell: bash
-      run: |
-        sudo sysctl -w fs.inotify.max_user_watches=524288
-        sudo sysctl -w fs.inotify.max_user_instances=1024
-
+      run: bash "${{ github.action_path }}/increase-inotify-limits.sh"
+    - name: Delete stale kind cluster
+      shell: bash
+      run: bash "${{ github.action_path }}/delete-stale-kind-cluster.sh"
+    - name: Check runner capacity
+      shell: bash
+      env:
+        MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
+        MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }}
+      run: bash "${{ github.action_path }}/check-runner-capacity.sh"
+    - name: Warm kind node image
+      if: ${{ inputs.kind_node_image != '' }}
+      shell: bash
+      env:
+        KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
+        MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
+      run: bash "${{ github.action_path }}/warm-kind-node-image.sh"
     - name: Create GPU-enabled kind cluster
       shell: bash
-      run: |
-        nvkind cluster create --name="${KIND_CLUSTER_NAME}" || echo "::warning::nvkind cluster create returned non-zero (umount errors are expected with CDI mode)"
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide
-
+      env:
+        KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
+        CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }}
+        CONTROL_PLANE_RESOURCE_PATCHES: ${{ inputs.control_plane_resource_patches }}
+        CONTROL_PLANE_LEADER_ELECTION_TUNING: ${{ inputs.control_plane_leader_election_tuning }}
+        LEADER_ELECTION_LEASE_DURATION: ${{ inputs.leader_election_lease_duration }}
+        LEADER_ELECTION_RENEW_DEADLINE: ${{ inputs.leader_election_renew_deadline }}
+        LEADER_ELECTION_RETRY_PERIOD: ${{ inputs.leader_election_retry_period }}
+        API_SERVER_CPU_REQUEST: ${{ inputs.api_server_cpu_request }}
+        API_SERVER_MEMORY_REQUEST: ${{ inputs.api_server_memory_request }}
+        CONTROLLER_MANAGER_CPU_REQUEST: ${{ inputs.controller_manager_cpu_request }}
+        CONTROLLER_MANAGER_MEMORY_REQUEST: ${{ inputs.controller_manager_memory_request }}
+        SCHEDULER_CPU_REQUEST: ${{ inputs.scheduler_cpu_request }}
+        SCHEDULER_MEMORY_REQUEST: ${{ inputs.scheduler_memory_request }}
+        ETCD_CPU_REQUEST: ${{ inputs.etcd_cpu_request }}
+        ETCD_MEMORY_REQUEST: ${{ inputs.etcd_memory_request }}
+      run: bash "${{ github.action_path }}/create-gpu-kind-cluster.sh"
     - name: Print GPUs (nvkind)
       shell: bash
       run: nvkind cluster print-gpus --name="${KIND_CLUSTER_NAME}"
diff --git a/.github/actions/gpu-cluster-setup/check-runner-capacity.sh b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
new file mode 100644
index 000000000..ff6c3168e
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Fails the step unless the runner has at least MIN_FREE_DISK_GB GiB free on /
+# and at least MIN_AVAILABLE_MEMORY_GB GiB of available memory.
+#
+# Environment:
+#   MIN_FREE_DISK_GB         - required free space on /, whole GiB
+#   MIN_AVAILABLE_MEMORY_GB  - required available memory, whole GiB
+# Exit status: 0 when both thresholds are met, 1 otherwise (with an ::error::
+# annotation describing the shortfall or the invalid input).
+
+# Both thresholds feed shell arithmetic, so reject anything that is not a plain
+# non-negative integer up front instead of failing with an arithmetic error.
+for threshold_name in MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do
+  threshold_value="${!threshold_name:-}"
+  if ! [[ "${threshold_value}" =~ ^[0-9]+$ ]]; then
+    echo "::error::${threshold_name} must be a non-negative integer, got '${threshold_value}'"
+    exit 1
+  fi
+done
+
+free_disk_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9')
+if [[ -z "${free_disk_bytes}" ]]; then
+  echo "::error::could not determine free disk space on /"
+  exit 1
+fi
+# 10# forces base 10 so a value such as 08 is not rejected as invalid octal.
+min_free_disk_bytes=$((10#${MIN_FREE_DISK_GB} * 1024 * 1024 * 1024))
+free_disk_gib=$((free_disk_bytes / 1024 / 1024 / 1024))
+if (( free_disk_bytes < min_free_disk_bytes )); then
+  echo "::error::free disk on / is ${free_disk_bytes} bytes (${free_disk_gib}GiB), need at least ${min_free_disk_bytes} bytes (${MIN_FREE_DISK_GB}GiB)"
+  exit 1
+fi
+
+# Column 7 of the Mem: row is read as the available-memory figure; fail with a
+# clear message if this free(1) build does not print a number there.
+available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}')
+if ! [[ "${available_memory_gb}" =~ ^[0-9]+$ ]]; then
+  echo "::error::could not determine available memory from free output"
+  exit 1
+fi
+if (( available_memory_gb < 10#${MIN_AVAILABLE_MEMORY_GB} )); then
+  echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB"
+  exit 1
+fi
+
+echo "Runner capacity is sufficient: disk=${free_disk_gib}GiB (${free_disk_bytes} bytes) memory=${available_memory_gb}GiB"
diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
new file mode 100644
index 000000000..84635a988
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Configures Docker through nvidia-ctk (default runtime plus CDI), applies two
+# toolkit config settings, restarts Docker, and waits until the daemon
+# answers again. Exits non-zero with diagnostics if the restart fails or
+# Docker is not healthy after 30 polls.
+
+sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
+for toolkit_setting in \
+  accept-nvidia-visible-devices-as-volume-mounts=true \
+  accept-nvidia-visible-devices-envvar-when-unprivileged=false; do
+  sudo nvidia-ctk config --set "${toolkit_setting}" --in-place
+done
+
+# Capture the restart status instead of letting `set -e` abort, so the
+# failure path can print service status and recent journal entries first.
+restart_status=0
+timeout 120s sudo systemctl restart docker || restart_status=$?
+if (( restart_status != 0 )); then
+  echo "::error::Docker restart failed after NVIDIA runtime configuration"
+  sudo systemctl status docker --no-pager || true
+  sudo journalctl -u docker --since "10 minutes ago" --no-pager || true
+  exit "${restart_status}"
+fi
+
+# Poll up to 30 times, 2 seconds apart, for an active unit and a responsive daemon.
+attempt=1
+while (( attempt <= 30 )); do
+  if systemctl is-active --quiet docker && timeout 5s docker info >/dev/null 2>&1; then
+    echo "Docker is healthy after NVIDIA runtime configuration."
+    exit 0
+  fi
+  echo "Waiting for Docker to become healthy... (${attempt}/30)"
+  sleep 2
+  attempt=$((attempt + 1))
+done
+
+echo "::error::Docker did not become healthy after NVIDIA runtime configuration"
+sudo systemctl status docker --no-pager || true
+exit 1
diff --git a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
new file mode 100644
index 000000000..0c22fb845
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
@@ -0,0 +1,487 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Accepts a duration written as a whole number followed by s, m, or h.
+#
+# Arguments:
+#   $1 - input name, used only in the error annotation
+#   $2 - value to validate
+# Returns 0 for a valid value; otherwise prints an ::error:: annotation and
+# exits the script with status 1.
+validate_duration_input() {
+  local input_name="$1"
+  local input_value="$2"
+  local duration_pattern='^[0-9]+[smh]$'
+
+  [[ "${input_value}" =~ ${duration_pattern} ]] && return 0
+
+  echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'"
+  exit 1
+}
+
+# Runs kubectl against the kind-${KIND_CLUSTER_NAME} context with a 10s
+# request timeout, and bounds the whole invocation at 30s.
+kubectl_kind() {
+  local -a kubectl_cmd=(kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}")
+
+  timeout 30s "${kubectl_cmd[@]}" "$@"
+}
+
+# Long-running variant of kubectl_kind: 300s request timeout, with the whole
+# invocation bounded at 330s.
+kubectl_kind_wait() {
+  local -a kubectl_cmd=(kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}")
+
+  timeout 330s "${kubectl_cmd[@]}" "$@"
+}
+
+# Runs a docker command under a time limit.
+#
+# Arguments:
+#   $1   - limit passed to timeout (for example 30s)
+#   $2.. - docker subcommand and its arguments
+docker_timeout() {
+  timeout "$1" docker "${@:2}"
+}
+
+# Sanity-checks the rendered patch files and kind config template before they
+# are handed to nvkind; prints the offending file and exits 1 on a mismatch.
+#
+# Globals read: CONTROL_PLANE_RESOURCE_PATCHES,
+#   CONTROL_PLANE_LEADER_ELECTION_TUNING, LEADER_ELECTION_LEASE_DURATION,
+#   LEADER_ELECTION_RENEW_DEADLINE, LEADER_ELECTION_RETRY_PERIOD,
+#   patch_dir, config_template.
+validate_generated_control_plane_config() {
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+    # Every static pod patch must be a v1 Pod carrying a resources block.
+    for patch_file in "${patch_dir}"/*.yaml; do
+      if grep -Fxq 'apiVersion: v1' "${patch_file}" &&
+        grep -Fxq 'kind: Pod' "${patch_file}" &&
+        grep -Eq '^[[:space:]]+resources:$' "${patch_file}"; then
+        continue
+      fi
+      echo "::error::rendered static pod patch ${patch_file} is missing expected top-level YAML"
+      sed 's/^/  /' "${patch_file}" || true
+      exit 1
+    done
+
+    # The template must mount the patch directory and point kubeadm at it.
+    if ! { grep -Eq '^[[:space:]]*extraMounts:$' "${config_template}" &&
+      grep -Fq 'directory: /patches' "${config_template}"; }; then
+      echo "::error::rendered kind config is missing control-plane patch mounts"
+      sed 's/^/  /' "${config_template}" || true
+      exit 1
+    fi
+  fi
+
+  if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+    # Both kubeadm API versions and both argument spellings must be present.
+    for expected in \
+      'apiVersion: kubeadm.k8s.io/v1beta3' \
+      'apiVersion: kubeadm.k8s.io/v1beta4' \
+      "leader-elect-lease-duration: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
+      "leader-elect-renew-deadline: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
+      "leader-elect-retry-period: \"${LEADER_ELECTION_RETRY_PERIOD}\"" \
+      "value: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
+      "value: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
+      "value: \"${LEADER_ELECTION_RETRY_PERIOD}\""; do
+      grep -Fq "${expected}" "${config_template}" && continue
+      echo "::error::rendered kind config is missing expected leader election setting: ${expected}"
+      sed 's/^/  /' "${config_template}" || true
+      exit 1
+    done
+  fi
+}
+
+# Reject malformed durations before anything is created.
+validate_duration_input cluster_create_timeout "${CLUSTER_CREATE_TIMEOUT}"
+validate_duration_input leader_election_lease_duration "${LEADER_ELECTION_LEASE_DURATION}"
+validate_duration_input leader_election_renew_deadline "${LEADER_ELECTION_RENEW_DEADLINE}"
+validate_duration_input leader_election_retry_period "${LEADER_ELECTION_RETRY_PERIOD}"
+
+# Arguments for `nvkind cluster create`; the node image is optional.
+CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}")
+[[ -z "${KIND_NODE_IMAGE}" ]] || {
+  echo "Using kind node image: ${KIND_NODE_IMAGE}"
+  CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}")
+}
+
+# Both switches accept only true/false; an empty value is treated as false.
+if [[ -z "${CONTROL_PLANE_RESOURCE_PATCHES}" || "${CONTROL_PLANE_RESOURCE_PATCHES}" == "false" ]]; then
+  CONTROL_PLANE_RESOURCE_PATCHES=false
+elif [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" != "true" ]]; then
+  echo "::error::control_plane_resource_patches must be true or false, got '${CONTROL_PLANE_RESOURCE_PATCHES}'"
+  exit 1
+fi
+
+if [[ -z "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "false" ]]; then
+  CONTROL_PLANE_LEADER_ELECTION_TUNING=false
+elif [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" != "true" ]]; then
+  echo "::error::control_plane_leader_election_tuning must be true or false, got '${CONTROL_PLANE_LEADER_ELECTION_TUNING}'"
+  exit 1
+fi
+
+# Render a custom nvkind config template (and, when requested, kubeadm static
+# pod patches) only if at least one control-plane tuning option is enabled.
+# Otherwise nvkind is invoked without --config-template.
+if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+  # Scratch locations for the rendered files; the EXIT trap removes both.
+  patch_dir="$(mktemp -d)"
+  config_template="$(mktemp)"
+  cleanup_generated_config() {
+    [[ -n "${patch_dir:-}" ]] && rm -rf "${patch_dir}"
+    [[ -n "${config_template:-}" ]] && rm -f "${config_template}"
+  }
+  trap cleanup_generated_config EXIT
+
+  # Keep YAML heredocs at column 0; indentation is literal content.
+  # One patch file per static pod, each setting only resources.requests.
+  # Unquoted EOF: the *_REQUEST variables are expanded by the shell.
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  cat > "${patch_dir}/kube-apiserver+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: kube-apiserver
+  namespace: kube-system
+spec:
+  containers:
+  - name: kube-apiserver
+    resources:
+      requests:
+        cpu: ${API_SERVER_CPU_REQUEST}
+        memory: ${API_SERVER_MEMORY_REQUEST}
+EOF
+
+  cat > "${patch_dir}/kube-controller-manager+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: kube-controller-manager
+  namespace: kube-system
+spec:
+  containers:
+  - name: kube-controller-manager
+    resources:
+      requests:
+        cpu: ${CONTROLLER_MANAGER_CPU_REQUEST}
+        memory: ${CONTROLLER_MANAGER_MEMORY_REQUEST}
+EOF
+
+  cat > "${patch_dir}/kube-scheduler+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: kube-scheduler
+  namespace: kube-system
+spec:
+  containers:
+  - name: kube-scheduler
+    resources:
+      requests:
+        cpu: ${SCHEDULER_CPU_REQUEST}
+        memory: ${SCHEDULER_MEMORY_REQUEST}
+EOF
+
+  cat > "${patch_dir}/etcd+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: etcd
+  namespace: kube-system
+spec:
+  containers:
+  - name: etcd
+    resources:
+      requests:
+        cpu: ${ETCD_CPU_REQUEST}
+        memory: ${ETCD_MEMORY_REQUEST}
+EOF
+  fi
+
+  # The config template is assembled in pieces. Quoted 'EOF' sections keep the
+  # {{ ... }} template directives literal; unquoted EOF sections let the shell
+  # substitute values such as ${patch_dir}.
+  cat > "${config_template}" <<'EOF'
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+{{- if hasKey $ "name" }}
+name: {{ $.name }}
+{{- end }}
+nodes:
+- role: control-plane
+  {{- if hasKey $ "image" }}
+  image: {{ $.image }}
+  {{- end }}
+EOF
+  # Mount the host patch directory into the control-plane node at /patches.
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  cat >> "${config_template}" <<EOF
+  extraMounts:
+  - hostPath: ${patch_dir}
+    containerPath: /patches
+EOF
+  fi
+  # The outer condition guarantees at least one list item is appended below.
+  cat >> "${config_template}" <<'EOF'
+  kubeadmConfigPatches:
+EOF
+  # Point kubeadm at the mounted patch directory.
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  cat >> "${config_template}" <<'EOF'
+  - |
+    kind: InitConfiguration
+    patches:
+      directory: /patches
+EOF
+  fi
+  if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+  # kind v0.31 renders kubeadm v1beta3. Keep a v1beta4 patch too so
+  # this remains valid when a future kind image switches API versions.
+  # v1beta3 spells extraArgs as a map; v1beta4 as a list of name/value pairs.
+  cat >> "${config_template}" <<EOF
+  - |
+    kind: ClusterConfiguration
+    apiVersion: kubeadm.k8s.io/v1beta3
+    controllerManager:
+      extraArgs:
+        leader-elect-lease-duration: "${LEADER_ELECTION_LEASE_DURATION}"
+        leader-elect-renew-deadline: "${LEADER_ELECTION_RENEW_DEADLINE}"
+        leader-elect-retry-period: "${LEADER_ELECTION_RETRY_PERIOD}"
+    scheduler:
+      extraArgs:
+        leader-elect-lease-duration: "${LEADER_ELECTION_LEASE_DURATION}"
+        leader-elect-renew-deadline: "${LEADER_ELECTION_RENEW_DEADLINE}"
+        leader-elect-retry-period: "${LEADER_ELECTION_RETRY_PERIOD}"
+  - |
+    kind: ClusterConfiguration
+    apiVersion: kubeadm.k8s.io/v1beta4
+    controllerManager:
+      extraArgs:
+      - name: leader-elect-lease-duration
+        value: "${LEADER_ELECTION_LEASE_DURATION}"
+      - name: leader-elect-renew-deadline
+        value: "${LEADER_ELECTION_RENEW_DEADLINE}"
+      - name: leader-elect-retry-period
+        value: "${LEADER_ELECTION_RETRY_PERIOD}"
+    scheduler:
+      extraArgs:
+      - name: leader-elect-lease-duration
+        value: "${LEADER_ELECTION_LEASE_DURATION}"
+      - name: leader-elect-renew-deadline
+        value: "${LEADER_ELECTION_RENEW_DEADLINE}"
+      - name: leader-elect-retry-period
+        value: "${LEADER_ELECTION_RETRY_PERIOD}"
+EOF
+  fi
+  # Worker section of the template: one worker per entry in $.workers, with a
+  # /dev/null mount per listed device under /var/run/nvidia-container-devices.
+  cat >> "${config_template}" <<'EOF'
+{{- range $.workers }}
+- role: worker
+  {{- if hasKey $ "image" }}
+  image: {{ $.image }}
+  {{- end }}
+
+  {{- if hasKey . "devices" }}
+  {{- $devices := .devices }}
+  {{- if not (kindIs "slice" $devices) }}
+    {{- $devices = list .devices }}
+  {{- end }}
+  extraMounts:
+    # We inject all NVIDIA GPUs using the nvidia-container-runtime.
+    # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set
+    # in `/etc/nvidia-container-runtime/config.toml`
+    {{- range $d := $devices }}
+    - hostPath: /dev/null
+      containerPath: /var/run/nvidia-container-devices/{{ $d }}
+    {{- end }}
+  {{- end }}
+{{- end }}
+EOF
+  # Echo what was rendered so the job log records the applied settings.
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+    echo "Applying control-plane static pod resource patches from ${patch_dir}:"
+    for patch_file in "${patch_dir}"/*.yaml; do
+      echo "--- ${patch_file}"
+      sed 's/^/  /' "${patch_file}"
+    done
+  fi
+  if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+    echo "Increasing kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes:"
+    echo "  lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
+    echo "  renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
+    echo "  retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
+  fi
+  # Fail fast on a malformed render before nvkind consumes the template.
+  validate_generated_control_plane_config
+  CREATE_ARGS+=(--config-template="${config_template}")
+fi
+
+# Create the cluster under a wall-clock limit. The status is captured rather
+# than left to `set -e` so that a timeout (status 124) can be downgraded to a
+# warning; any other failure is fatal.
+create_status=0
+timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}" || create_status=$?
+if (( create_status == 124 )); then
+  echo "::warning::nvkind cluster create timed out after ${CLUSTER_CREATE_TIMEOUT}; continuing only if post-create checks pass"
+elif (( create_status != 0 )); then
+  echo "::error::nvkind cluster create failed with status ${create_status}"
+  exit "${create_status}"
+fi
+
+# Post-create checks: all nodes must report Ready and the API must answer.
+kubectl_kind_wait wait --for=condition=Ready nodes --all --timeout=300s
+kubectl_kind cluster-info
+kubectl_kind get nodes -o wide
+# Informational summary; a grep miss must not fail the step.
+kubectl_kind describe nodes \
+  | grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:|  cpu|  memory|  nvidia.com/gpu)" \
+  || true
+
+# Print the Docker resource settings of every node container of this cluster.
+echo "=== Kind node container resources ==="
+docker_timeout 30s ps \
+  --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+  --format '{{.Names}}' \
+  | sort \
+  | while read -r node_container; do
+      if [[ -n "${node_container}" ]]; then
+        docker_timeout 30s inspect "${node_container}" \
+          --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}'
+      fi
+    done
+
+# Best-effort listing of requests/limits for every control-plane container.
+echo "=== Control-plane resource requests/limits ==="
+kubectl_kind -n kube-system get pods -l tier=control-plane -o json \
+  | jq -r '
+    .items[] as $pod |
+    $pod.metadata.name,
+    ($pod.spec.containers[] |
+      "  " + .name +
+      " requests=" + ((.resources.requests // {}) | tostring) +
+      " limits=" + ((.resources.limits // {}) | tostring))
+  ' || true
+
+# Rewrites a CPU quantity into the canonical spelling Kubernetes reports:
+# whole cores as "N", anything else as "<millicores>m". This lets a configured
+# value and the value read back from the API be compared as plain strings.
+#
+# Arguments:
+#   $1 - CPU quantity such as 1000m, 250m, 2, 0.5 or 1.50
+# Output: the normalized quantity on stdout. Input in any other form (empty,
+#   more than three decimals, other suffixes) is echoed unchanged.
+normalize_cpu_request() {
+  local cpu="$1"
+  local millicores_pattern='^([0-9]+)m$'
+  local decimal_cores_pattern='^([0-9]+)(\.([0-9]{1,3}))?$'
+  local fraction
+  local millis
+
+  if [[ "${cpu}" =~ ${millicores_pattern} ]]; then
+    # 10# forces base 10 so leading zeros are not read as octal.
+    millis=$((10#${BASH_REMATCH[1]}))
+  elif [[ "${cpu}" =~ ${decimal_cores_pattern} ]]; then
+    # Right-pad the fraction to three digits so ".5" counts as 500 millicores.
+    fraction="${BASH_REMATCH[3]:-}000"
+    millis=$((10#${BASH_REMATCH[1]} * 1000 + 10#${fraction:0:3}))
+  else
+    echo "${cpu}"
+    return
+  fi
+
+  if (( millis % 1000 == 0 )); then
+    echo "$((millis / 1000))"
+  else
+    echo "${millis}m"
+  fi
+}
+
+# Print the <resource> request of the first container in the first kube-system
+# pod labelled component=<component>.
+control_plane_request() {
+  local component="$1" resource="$2"
+  local jsonpath="{.items[0].spec.containers[0].resources.requests.${resource}}"
+
+  kubectl_kind -n kube-system get pod -l "component=${component}" -o "jsonpath=${jsonpath}"
+}
+
+# Fail the script unless the live request for <resource> on <component>
+# matches <expected>; CPU values are compared after normalize_cpu_request.
+assert_control_plane_request() {
+  local component="$1" resource="$2" want="$3"
+  local got
+  got="$(control_plane_request "${component}" "${resource}")"
+  if [[ "${resource}" == "cpu" ]]; then
+    want="$(normalize_cpu_request "${want}")"
+    got="$(normalize_cpu_request "${got}")"
+  fi
+  if [[ "${got}" == "${want}" ]]; then
+    echo "${component} ${resource} request verified: ${got}"
+    return
+  fi
+  echo "::error::${component} ${resource} request is '${got}', expected '${want}'"
+  exit 1
+}
+
+# Print, one per line, the command followed by the args of the first container
+# of the first kube-system pod labelled component=<component>.
+control_plane_command_args() {
+  local component="$1"
+  kubectl_kind -n kube-system get pod -l "component=${component}" -o json |
+    jq -r '.items[0].spec.containers[0] | ((.command // []) + (.args // []))[]?'
+}
+
+# Succeed when the static pod manifest for <component> on the control-plane
+# node contains the literal list item "- <expected>".
+static_pod_manifest_contains_arg() {
+  local component="$1" expected="$2"
+  local manifest="/etc/kubernetes/manifests/${component}.yaml"
+  docker_timeout 30s exec "${KIND_CLUSTER_NAME}-control-plane" grep -Fq -- "- ${expected}" "${manifest}"
+}
+
+# Succeed when any running CRI container named <component> on the control-plane
+# node carries <expected> in its process args (exact jq match first, otherwise
+# a raw substring match on the `crictl inspect` output).
+running_static_pod_container_contains_arg() {
+  local component="$1" expected="$2"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local ids id inspected
+
+  ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null)" || return 1
+  if [[ -z "${ids}" ]]; then
+    return 1
+  fi
+  # Word-split on purpose so that each container ID becomes one loop item.
+  for id in ${ids}; do
+    inspected="$(docker_timeout 30s exec "${node}" crictl inspect "${id}" 2>/dev/null || true)"
+    if jq -e --arg expected "${expected}" '
+      ([.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?] | index($expected)) != null
+    ' >/dev/null 2>&1 <<< "${inspected}" || grep -Fq -- "${expected}" <<< "${inspected}"; then
+      return 0
+    fi
+  done
+  return 1
+}
+
+# Diagnostic helper: print the process args of every running CRI container
+# named <component> on the control-plane node; docker/crictl/jq failures are ignored.
+dump_running_static_pod_container_args() {
+  local component="$1" node="${KIND_CLUSTER_NAME}-control-plane"
+  local ids id
+
+  echo "Running ${component} CRI container args:"
+  ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)"
+  if [[ -n "${ids}" ]]; then
+    for id in ${ids}; do
+      echo "--- ${id} ---"
+      docker_timeout 30s exec "${node}" crictl inspect "${id}" 2>/dev/null | jq -r '
+      [.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?][]?
+    ' || true
+    done
+  else
+    echo "(no running ${component} CRI containers found)"
+  fi
+}
+
+# Best-effort diagnostic: print the first 220 lines of <component>'s static pod manifest.
+dump_static_pod_manifest() {
+  local component="$1" node="${KIND_CLUSTER_NAME}-control-plane"
+  local manifest="/etc/kubernetes/manifests/${component}.yaml"
+  echo "Static pod manifest /etc/kubernetes/manifests/${component}.yaml:"
+  docker_timeout 30s exec "${node}" sed -n '1,220p' "${manifest}" || true
+}
+
+# Fail the script unless <component> runs with the exact argument <expected>.
+# Sources are checked in order: the live pod spec, the running CRI container,
+# then the static pod manifest. A manifest-only match means kubelet has not
+# converged yet, so the check is re-polled (up to 12 tries, 5s apart).
+assert_control_plane_arg() {
+  local component="$1" expected="$2"
+  local attempt command_args
+
+  for (( attempt = 1; attempt <= 12; attempt++ )); do
+    command_args="$(control_plane_command_args "${component}" || true)"
+    if grep -Fxq -- "${expected}" <<< "${command_args}"; then
+      echo "${component} command/args verified: ${expected}"
+      return
+    fi
+    if running_static_pod_container_contains_arg "${component}" "${expected}"; then
+      echo "${component} running CRI container args verified: ${expected} (live mirror pod omitted it)"
+      return
+    fi
+    # Not in the manifest either: waiting cannot help, so stop polling now.
+    static_pod_manifest_contains_arg "${component}" "${expected}" || break
+    echo "::warning::${component} static pod manifest has ${expected}, but the running container does not yet; waiting for kubelet to converge (${attempt}/12)"
+    sleep 5
+  done
+
+  # Give up: annotate the failure and dump every source for debugging.
+  echo "::error::${component} running command/args does not contain ${expected}"
+  echo "Observed live command/args:"
+  echo "${command_args:-}"
+  dump_running_static_pod_container_args "${component}"
+  dump_static_pod_manifest "${component}"
+  exit 1
+}
+
+if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then  # runs only when the flag is exactly "true"
+  echo "Verifying control-plane resource patches..."
+  assert_control_plane_request kube-apiserver cpu "${API_SERVER_CPU_REQUEST}"  # each assert exits 1 on the first mismatch
+  assert_control_plane_request kube-apiserver memory "${API_SERVER_MEMORY_REQUEST}"
+  assert_control_plane_request kube-controller-manager cpu "${CONTROLLER_MANAGER_CPU_REQUEST}"
+  assert_control_plane_request kube-controller-manager memory "${CONTROLLER_MANAGER_MEMORY_REQUEST}"
+  assert_control_plane_request kube-scheduler cpu "${SCHEDULER_CPU_REQUEST}"
+  assert_control_plane_request kube-scheduler memory "${SCHEDULER_MEMORY_REQUEST}"
+  assert_control_plane_request etcd cpu "${ETCD_CPU_REQUEST}"
+  assert_control_plane_request etcd memory "${ETCD_MEMORY_REQUEST}"
+fi
+
+if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then  # runs only when the flag is exactly "true"
+  echo "Verifying control-plane leader election timeout patches..."
+  for component in kube-controller-manager kube-scheduler; do  # each component must carry all three flags below
+    assert_control_plane_arg "${component}" "--leader-elect-lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
+    assert_control_plane_arg "${component}" "--leader-elect-renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
+    assert_control_plane_arg "${component}" "--leader-elect-retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
+  done
+fi
diff --git a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
new file mode 100644
index 000000000..5e0a81778
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Label filter used below to select this cluster's kind containers.
+kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+# Run a docker subcommand, giving up after 30 seconds.
+docker_timeout() { timeout 30s docker "$@"; }
+
+# Populate the global array remaining_containers with the IDs of all containers
+# (running or stopped) carrying this cluster's kind label; exits 1 if docker fails.
+read_kind_container_ids() {
+  local output
+
+  # Capture stdout only: docker's stderr (warnings, errors) goes straight to the
+  # job log and must never be parsed as container IDs handed to `docker rm`.
+  if ! output="$(docker_timeout ps -aq --filter "label=${kind_cluster_label}")"; then
+    echo "::error::failed to query stale kind containers for ${KIND_CLUSTER_NAME}"
+    exit 1
+  fi
+  remaining_containers=()
+  [[ -z "${output}" ]] || mapfile -t remaining_containers <<< "${output}"
+}
+
+# grep reads a process substitution so only its own status decides; in `kind | grep -q`, pipefail can turn an early grep exit into a miss.
+if grep -Fxq -- "${KIND_CLUSTER_NAME}" < <(kind get clusters); then
+  echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
+  timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}" ||
+    echo "::warning::kind delete cluster timed out or failed; falling back to direct container cleanup"
+else
+  echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
+fi
+
+read_kind_container_ids  # refresh remaining_containers after the kind-level delete attempt
+if (( ${#remaining_containers[@]} > 0 )); then
+  echo "Removing stale containers for ${KIND_CLUSTER_NAME}:"
+  docker_timeout ps -a --filter "label=${kind_cluster_label}"  # log what is about to be removed
+  docker_timeout rm -f "${remaining_containers[@]}"
+fi
+
+read_kind_container_ids  # query again to confirm nothing with the label is left
+if (( ${#remaining_containers[@]} > 0 )); then
+  echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:"
+  docker_timeout ps -a --filter "label=${kind_cluster_label}"
+  exit 1
+fi
diff --git a/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh
new file mode 100644
index 000000000..843496a38
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Raise the host's inotify watch and instance limits via sysctl (requires sudo).
+sudo sysctl -w fs.inotify.max_user_watches=524288
+sudo sysctl -w fs.inotify.max_user_instances=1024
diff --git a/.github/actions/gpu-cluster-setup/install-nvkind.sh b/.github/actions/gpu-cluster-setup/install-nvkind.sh
new file mode 100644
index 000000000..c2200e078
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/install-nvkind.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# The version to install is mandatory; fail with an annotation when it is missing.
+[[ -n "${NVKIND_VERSION:-}" ]] || {
+  echo "::error::NVKIND_VERSION must be set"
+  exit 1
+}
+go install "github.com/NVIDIA/nvkind/cmd/nvkind@${NVKIND_VERSION}"
+# Smoke-test the installed binary from GOBIN, or GOPATH/bin when GOBIN is unset or empty.
+"${GOBIN:-$(go env GOPATH)/bin}/nvkind" --help
diff --git a/.github/actions/gpu-cluster-setup/runner-preflight.sh b/.github/actions/gpu-cluster-setup/runner-preflight.sh
new file mode 100644
index 000000000..678b9d419
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/runner-preflight.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Log a baseline of the runner: time, host, load, CPU count, memory, disk and inode usage of /.
+echo "=== Runner baseline ==="
+date -u
+hostname
+uptime
+nproc
+free -h
+df -h /
+df -ih /
+
+# Validate thresholds; ":-" lets an unset variable reach the annotated error instead of a bare `set -u` abort.
+for value_name in MIN_GPU_COUNT MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do
+  value="${!value_name:-}"
+  [[ "${value}" =~ ^[0-9]+$ ]] && continue
+  echo "::error::${value_name} must be an integer, got '${value}'"
+  exit 1
+done
+
+echo "=== Docker health ==="
+docker info >/dev/null  # output discarded; under set -e a non-zero exit aborts the preflight
+docker version
+
+echo "=== Host GPUs ==="
+nvidia-smi -L  # one line per GPU
+nvidia-smi
+
+# Count visible GPUs (only names matching GPU_MODEL_PATTERN, when set) and enforce MIN_GPU_COUNT.
+mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader)
+if [[ -z "${GPU_MODEL_PATTERN}" ]]; then
+  gpu_count="${#gpu_names[@]}"
+  echo "Visible GPUs: ${gpu_count}"
+else
+  grep_status=0
+  gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic -- "${GPU_MODEL_PATTERN}") || grep_status=$?
+  case "${grep_status}" in
+    0) ;;
+    2)
+      echo "::error::invalid gpu_model_pattern regex: ${GPU_MODEL_PATTERN}"
+      exit 1
+      ;;
+    *) gpu_count=0 ;;  # e.g. grep's status 1: no line matched
+  esac
+  echo "Visible GPUs matching '${GPU_MODEL_PATTERN}': ${gpu_count}"
+fi
+
+if (( gpu_count < MIN_GPU_COUNT )); then
+  echo "::error::visible GPU count ${gpu_count} is below required minimum ${MIN_GPU_COUNT}"
+  exit 1
+fi
+
+echo "=== Existing kind state ==="
+kind get clusters || true
+docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
diff --git a/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
new file mode 100644
index 000000000..6f01ba156
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+timeout 300s docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L  # NOTE(review): presumably relies on the NVIDIA runtime's volume-mount device selection — confirm toolkit config
diff --git a/.github/actions/gpu-cluster-setup/validate-env.sh b/.github/actions/gpu-cluster-setup/validate-env.sh
new file mode 100644
index 000000000..697d077c2
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/validate-env.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Guard: KIND_CLUSTER_NAME must be present and non-empty in the environment.
+[[ -n "${KIND_CLUSTER_NAME:-}" ]] || {
+  echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow"
+  exit 1
+}
diff --git a/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
new file mode 100644
index 000000000..b0567fa7c
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+echo "=== Kind node image cache ==="
+if ! docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then  # inspect failed: treat as not cached
+  echo "Pulling kind node image: ${KIND_NODE_IMAGE}"
+  timeout 600s docker pull "${KIND_NODE_IMAGE}"
+else
+  echo "Kind node image already cached: ${KIND_NODE_IMAGE}"
+fi
+free_disk_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9')  # bytes available on /
+min_free_disk_bytes=$((MIN_FREE_DISK_GB << 30))  # GiB -> bytes
+free_disk_gib=$((free_disk_bytes >> 30))  # bytes -> whole GiB, used in the messages
+if (( free_disk_bytes < min_free_disk_bytes )); then
+  echo "::error::free disk on / is ${free_disk_bytes} bytes (${free_disk_gib}GiB) after warming ${KIND_NODE_IMAGE}, need at least ${min_free_disk_bytes} bytes (${MIN_FREE_DISK_GB}GiB)"
+  exit 1
+fi
+echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gib}GiB (${free_disk_bytes} bytes)"
diff --git a/.github/actions/gpu-debug-diagnostics/action.yml b/.github/actions/gpu-debug-diagnostics/action.yml
new file mode 100644
index 000000000..e5a38b964
--- /dev/null
+++ b/.github/actions/gpu-debug-diagnostics/action.yml
@@ -0,0 +1,35 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'GPU Debug Diagnostics'
+description: 'Print bounded GPU CI diagnostics while the kind cluster is still present.'
+
+inputs:
+  cluster_name:
+    description: 'Kind cluster name'
+    required: true
+  mode:
+    description: 'Diagnostic mode: smoke, training, or inference'
+    required: false
+    default: 'smoke'
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Print GPU debug diagnostics
+      shell: bash
+      env:  # inputs are handed to the script as environment variables
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+        GPU_TEST_DIAGNOSTIC_MODE: ${{ inputs.mode }}
+      run: bash "${{ github.workspace }}/.github/scripts/gpu-debug-diagnostics.sh"  # script is resolved from the workspace checkout, not from this action's directory
diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml
index e2bdb300c..b30c63f2d 100644
--- a/.github/actions/gpu-operator-install/action.yml
+++ b/.github/actions/gpu-operator-install/action.yml
@@ -31,6 +31,14 @@ inputs:
     description: 'Platform for recipe generation (bundle mode only, e.g. dynamo)'
     required: false
     default: ''
+  wait:
+    description: 'Wait for bundle Helm resources during deploy'
+    required: false
+    default: 'false'  # string on purpose: install-bundle.sh compares against the literal "true"
+  best_effort:
+    description: 'Continue deploying remaining bundle components after a component failure'
+    required: false
+    default: 'true'  # keeps the previously hard-coded --best-effort behavior
 
 runs:
   using: 'composite'
@@ -41,102 +49,33 @@ runs:
     - name: Install GPU Operator (helm)
       if: inputs.method == 'helm'
       shell: bash
-      run: |
-        helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
-        helm repo update
-        helm upgrade -i \
-          --kube-context="kind-${KIND_CLUSTER_NAME}" \
-          --namespace gpu-operator \
-          --create-namespace \
-          --set driver.enabled=false \
-          --set toolkit.enabled=false \
-          --set dcgmExporter.enabled=false \
-          --set nfd.enabled=true \
-          --wait --timeout=600s \
-          gpu-operator nvidia/gpu-operator
-
+      run: bash "${{ github.action_path }}/install-gpu-operator-helm.sh"
     - name: Wait for GPU operands (helm)
       if: inputs.method == 'helm'
       shell: bash
-      run: |
-        echo "Waiting for device plugin to be ready..."
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-          rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true
-        echo "GPU Operator pods:"
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
-
+      run: bash "${{ github.action_path }}/wait-gpu-operands-helm.sh"
     # --- Bundle mode: aicr recipe → bundle → deploy ---
 
     - name: Generate recipe
       if: inputs.method == 'bundle'
       shell: bash
-      run: |
-        PLATFORM_FLAG=""
-        if [[ -n "${{ inputs.platform }}" ]]; then
-          PLATFORM_FLAG="--platform ${{ inputs.platform }}"
-        fi
-        ./aicr recipe \
-          --service kind \
-          --accelerator ${{ inputs.accelerator }} \
-          --os ubuntu \
-          --intent ${{ inputs.intent }} \
-          ${PLATFORM_FLAG} \
-          --output recipe.yaml
-        echo "--- Recipe ---"
-        cat recipe.yaml
-
+      env:  # inputs reach the script as environment variables, not as text spliced into shell code
+        AICR_ACCELERATOR: ${{ inputs.accelerator }}
+        AICR_INTENT: ${{ inputs.intent }}
+        AICR_PLATFORM: ${{ inputs.platform }}  # may be empty; generate-recipe.sh then omits --platform
+      run: bash "${{ github.action_path }}/generate-recipe.sh"
     - name: Generate deployment bundle
       if: inputs.method == 'bundle'
       shell: bash
-      run: |
-        ./aicr bundle \
-          --recipe recipe.yaml \
-          --accelerated-node-toleration nvidia.com/gpu:NoSchedule \
-          --output bundle
-        echo "--- Bundle contents ---"
-        ls -la bundle/
-
+      run: bash "${{ github.action_path }}/generate-bundle.sh"
     - name: Install bundle into cluster
       if: inputs.method == 'bundle'
       shell: bash
-      run: |
-        cd bundle
-        # Use --no-wait: several components (gpu-operator ClusterPolicy,
-        # kai-scheduler SchedulingShard, nvidia-dra-driver-gpu kubelet plugin)
-        # stay InProgress in kind because their CRs/DaemonSets require
-        # features not available in kind (DRA feature gates, driver modules).
-        # The explicit "Wait for GPU operands" step below gates on what
-        # actually matters (device plugin readiness).
-        # --best-effort: some components (e.g. network-operator) have Helm
-        # hooks that may time out in Kind; continue deploying remaining
-        # components so the overall stack is functional.
-        chmod +x deploy.sh
-        echo "--- deploy.sh ---"
-        cat deploy.sh
-        ./deploy.sh --no-wait --best-effort
-
+      env:
+        AICR_DEPLOY_WAIT: ${{ inputs.wait }}  # anything other than 'true' makes install-bundle.sh pass --no-wait
+        AICR_DEPLOY_BEST_EFFORT: ${{ inputs.best_effort }}  # 'true' makes install-bundle.sh pass --best-effort
+      run: bash "${{ github.action_path }}/install-bundle.sh"
     - name: Wait for GPU operands (bundle)
       if: inputs.method == 'bundle'
       shell: bash
-      run: |
-        echo "Waiting for GPU operator controller to deploy operands..."
-        # The GPU operator controller watches ClusterPolicy and creates
-        # DaemonSets for device-plugin, NFD, GFD, etc. This happens
-        # asynchronously after the helm install completes.
-        for i in $(seq 1 30); do
-          count=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-            get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l)
-          if [[ "$count" -gt 0 ]]; then
-            echo "Device plugin DaemonSet found."
-            break
-          fi
-          echo "Waiting for device plugin DaemonSet to be created... (${i}/30)"
-          sleep 10
-        done
-        echo "Waiting for device plugin rollout..."
-        # Operands are excluded from control-plane nodes via nodeAffinity in
-        # the kind overlay, so all scheduled pods should become ready.
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-          rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
-        echo "GPU Operator pods:"
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
+      run: bash "${{ github.action_path }}/wait-gpu-operands-bundle.sh"
diff --git a/.github/actions/gpu-operator-install/generate-bundle.sh b/.github/actions/gpu-operator-install/generate-bundle.sh
new file mode 100644
index 000000000..095b68415
--- /dev/null
+++ b/.github/actions/gpu-operator-install/generate-bundle.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Rebuild ./bundle from recipe.yaml, starting from a clean output directory.
+rm -rf bundle
+bundle_args=(--recipe recipe.yaml)
+bundle_args+=(--accelerated-node-toleration nvidia.com/gpu:NoSchedule)
+bundle_args+=(--output bundle)
+./aicr bundle "${bundle_args[@]}"
+echo "--- Bundle contents ---"
+ls -la bundle/
diff --git a/.github/actions/gpu-operator-install/generate-recipe.sh b/.github/actions/gpu-operator-install/generate-recipe.sh
new file mode 100644
index 000000000..6015e69ed
--- /dev/null
+++ b/.github/actions/gpu-operator-install/generate-recipe.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Build the `aicr recipe` command line: service and OS are fixed, accelerator
+# and intent come from the environment, and --platform is added only when
+# AICR_PLATFORM is non-empty.
+RECIPE_ARGS=(--service kind --accelerator "${AICR_ACCELERATOR}")
+RECIPE_ARGS+=(--os ubuntu --intent "${AICR_INTENT}")
+if [[ "${AICR_PLATFORM}" != "" ]]; then
+  RECIPE_ARGS+=(--platform "${AICR_PLATFORM}")
+fi
+RECIPE_ARGS+=(--output recipe.yaml)
+
+./aicr recipe "${RECIPE_ARGS[@]}"
+echo "Recipe written to recipe.yaml"
diff --git a/.github/actions/gpu-operator-install/install-bundle.sh b/.github/actions/gpu-operator-install/install-bundle.sh
new file mode 100644
index 000000000..cefa4ce5d
--- /dev/null
+++ b/.github/actions/gpu-operator-install/install-bundle.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+cd bundle
+# The default keeps legacy bundle-mode behavior: do not wait on every
+# Helm resource and keep deploying after component failures. H100
+# qualification jobs override these inputs to hard-fail and wait.
+chmod +x deploy.sh
+# Flags are derived from the AICR_DEPLOY_* environment variables.
+DEPLOY_ARGS=()
+[[ "${AICR_DEPLOY_WAIT}" == "true" ]] || DEPLOY_ARGS+=(--no-wait)
+[[ "${AICR_DEPLOY_BEST_EFFORT}" != "true" ]] || DEPLOY_ARGS+=(--best-effort)
+# Call deploy.sh without an array expansion when there are no flags: bash
+# older than 4.4 treats "${arr[@]}" of an empty array as unbound under set -u.
+if (( ${#DEPLOY_ARGS[@]} > 0 )); then
+  echo "Deploying bundle with args: ${DEPLOY_ARGS[*]}"
+  ./deploy.sh "${DEPLOY_ARGS[@]}"
+else
+  echo "Deploying bundle with default args"
+  ./deploy.sh
+fi
diff --git a/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh
new file mode 100644
index 000000000..6079cad83
--- /dev/null
+++ b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Pinned chart version, handed to helm through --version below.
+GPU_OPERATOR_CHART_VERSION="v25.10.1"
+# Operand toggles: driver, toolkit and DCGM exporter off; NFD on.
+operand_flags=(
+  --set driver.enabled=false
+  --set toolkit.enabled=false
+  --set dcgmExporter.enabled=false
+  --set nfd.enabled=true
+)
+
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update
+helm repo update
+helm upgrade -i --kube-context="kind-${KIND_CLUSTER_NAME}" --namespace gpu-operator --create-namespace \
+  "${operand_flags[@]}" --version="${GPU_OPERATOR_CHART_VERSION}" --wait --timeout=600s \
+  gpu-operator nvidia/gpu-operator
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
new file mode 100644
index 000000000..9566fb8ba
--- /dev/null
+++ b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# kubectl against the gpu-operator namespace of the kind cluster.
+# $1 is the --request-timeout value; remaining arguments are passed through.
+gpu_operator_kubectl() {
+  kubectl --request-timeout="$1" --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator "${@:2}"
+}
+
+echo "Waiting for GPU operator controller to deploy operands..."
+# The GPU operator controller watches ClusterPolicy and creates
+# DaemonSets for device-plugin, NFD, GFD, etc. This happens
+# asynchronously after the helm install completes.
+daemonset_found=false
+for i in $(seq 1 30); do
+  daemonsets="$(gpu_operator_kubectl 10s get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null)" || daemonsets=""
+  if [[ -n "${daemonsets}" ]]; then
+    daemonset_found=true
+    echo "Device plugin DaemonSet found."
+    break
+  fi
+  echo "Waiting for device plugin DaemonSet to be created... (${i}/30)"
+  sleep 10
+done
+if [[ "${daemonset_found}" != "true" ]]; then
+  echo "::error::device plugin DaemonSet was not created within 300s"
+  gpu_operator_kubectl 10s get pods || true
+  gpu_operator_kubectl 10s get events --sort-by='.lastTimestamp' || true
+  exit 1
+fi
+echo "Waiting for device plugin rollout..."
+# Operands are excluded from control-plane nodes via nodeAffinity in
+# the kind overlay, so all scheduled pods should become ready.
+gpu_operator_kubectl 300s rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
+echo "GPU Operator pods:"
+gpu_operator_kubectl 10s get pods
diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
new file mode 100644
index 000000000..3d3042f8a
--- /dev/null
+++ b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Wait for the device plugin DaemonSet to appear in the gpu-operator namespace,
+# then block until its rollout completes. KIND_CLUSTER_NAME selects the context.
+echo "Waiting for device plugin to be ready..."
+for i in $(seq 1 30); do
+  # Capture the listing instead of piping it into `grep -q`: with pipefail, grep
+  # exiting on its first match can SIGPIPE kubectl and fail a successful lookup.
+  daemonsets=""
+  if daemonsets=$(kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+    get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null); then
+    if [[ -n "${daemonsets}" ]]; then
+      echo "Device plugin DaemonSet found."
+      break
+    fi
+  fi
+  if (( i == 30 )); then
+    echo "::error::device plugin DaemonSet was not created within 300s"
+    kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods || true
+    kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' || true
+    exit 1
+  fi
+  echo "Waiting for device plugin DaemonSet to be created... (${i}/30)"
+  sleep 10
+done
+
+kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
+  rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
+echo "GPU Operator pods:"
+kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
diff --git a/.github/actions/gpu-smoke-nvidia-smi/action.yml b/.github/actions/gpu-smoke-nvidia-smi/action.yml
new file mode 100644
index 000000000..cb61b5d0d
--- /dev/null
+++ b/.github/actions/gpu-smoke-nvidia-smi/action.yml
@@ -0,0 +1,36 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'GPU Smoke nvidia-smi'
+description: 'Run nvidia-smi in a GPU-backed kind pod and print its logs.'
+
+inputs:
+  cluster_name:  # exported to both steps as KIND_CLUSTER_NAME
+    description: 'Kind cluster name'
+    required: true
+
+runs:
+  using: 'composite'
+  steps:  # helper scripts are resolved from the checked-out workspace (.github/scripts)
+    - name: Run nvidia-smi in a pod
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-run-nvidia-smi.sh"
+    - name: Show nvidia-smi output
+      if: always()  # still runs when the step above failed
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh"
diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml
index e1ee3c14b..7af987da0 100644
--- a/.github/actions/gpu-snapshot-validate/action.yml
+++ b/.github/actions/gpu-snapshot-validate/action.yml
@@ -26,60 +26,28 @@ inputs:
   cluster_name:
     description: 'Kind cluster name (for kubectl context)'
     required: true
+  snapshot_timeout:  # forwarded to run-snapshot.sh as SNAPSHOT_TIMEOUT
+    description: 'Timeout for aicr snapshot'
+    required: false
+    default: '5m'
 
 runs:
   using: composite
   steps:
     - name: Run aicr snapshot
       shell: bash
-      run: |
-        ./aicr snapshot \
-          --kubeconfig="${HOME}/.kube/config" \
-          --namespace=default \
-          --image=ko.local:smoke-test \
-          --require-gpu \
-          --output=snapshot.yaml
-        echo "--- Snapshot output ---"
-        cat snapshot.yaml
-
+      env:  # run-snapshot.sh passes this value to aicr as --timeout
+        SNAPSHOT_TIMEOUT: ${{ inputs.snapshot_timeout }}
+      run: bash "${{ github.action_path }}/run-snapshot.sh"
     - name: Validate snapshot detected GPU
       shell: bash
-      run: |
-        # Query by subtype field (not index) — #502 added a "hardware" subtype before "smi".
-        GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml)
-        GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml)
-        echo "GPU model: ${GPU_MODEL}"
-        echo "GPU count: ${GPU_COUNT}"
-        if [[ "${GPU_MODEL}" != *"${{ inputs.gpu_model }}"* ]]; then
-          echo "::error::Expected ${{ inputs.gpu_model }} GPU in snapshot, got: ${GPU_MODEL}"
-          exit 1
-        fi
-        if [[ "${GPU_COUNT}" -lt ${{ inputs.min_gpu_count }} ]]; then
-          echo "::error::Expected gpu-count >= ${{ inputs.min_gpu_count }}, got: ${GPU_COUNT}"
-          exit 1
-        fi
-        echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
-
+      env:  # inputs reach validate-snapshot-gpu.sh as environment variables
+        EXPECTED_GPU_MODEL: ${{ inputs.gpu_model }}
+        MIN_GPU_COUNT: ${{ inputs.min_gpu_count }}
+      run: bash "${{ github.action_path }}/validate-snapshot-gpu.sh"
     - name: Debug snapshot Job
       if: failure()
       shell: bash
-      run: |
-        echo "=== Snapshot Job ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default get job aicr -o yaml || true
-        echo "=== Snapshot Pods ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          get pods -l app.kubernetes.io/name=aicr -o wide || true
-        echo "=== Snapshot Job describe ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default describe job aicr || true
-        echo "=== Snapshot Pod describe ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          describe pods -l app.kubernetes.io/name=aicr || true
-        echo "=== Snapshot current logs ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true
-        echo "=== Snapshot previous logs ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true
-        echo "=== Snapshot ConfigMap ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          get configmap aicr-snapshot -o yaml || true
+      env:  # debug-snapshot-job.sh builds the kubectl context "kind-<name>" from this
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.action_path }}/debug-snapshot-job.sh"
diff --git a/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh
new file mode 100644
index 000000000..2e0f1547f
--- /dev/null
+++ b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Print a "=== title ===" banner, then run one kubectl command in the default
+# namespace; each call is capped at 30s and its failure is ignored.
+snapshot_debug() {
+  local title="$1"
+  shift
+  echo "=== ${title} ==="
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" \
+    -n default "$@" || true
+}
+
+aicr_selector="app.kubernetes.io/name=aicr"
+snapshot_debug "Snapshot Job" get job aicr -o yaml
+snapshot_debug "Snapshot Pods" get pods -l "${aicr_selector}" -o wide
+snapshot_debug "Snapshot Job describe" describe job aicr
+snapshot_debug "Snapshot Pod describe" describe pods -l "${aicr_selector}"
+snapshot_debug "Snapshot current logs" logs -l "${aicr_selector}" --all-containers --tail=200
+snapshot_debug "Snapshot previous logs" logs -l "${aicr_selector}" --all-containers --previous --tail=200
+snapshot_debug "Snapshot ConfigMap" get configmap aicr-snapshot -o yaml
diff --git a/.github/actions/gpu-snapshot-validate/run-snapshot.sh b/.github/actions/gpu-snapshot-validate/run-snapshot.sh
new file mode 100644
index 000000000..e45b575ef
--- /dev/null
+++ b/.github/actions/gpu-snapshot-validate/run-snapshot.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Assemble the aicr snapshot flags, run the command from the repository root,
+# and echo the snapshot.yaml it wrote. SNAPSHOT_TIMEOUT is set by action.yml.
+snapshot_args=(--kubeconfig="${HOME}/.kube/config" --namespace=default)
+snapshot_args+=(--image=ko.local:smoke-test --require-gpu)
+snapshot_args+=(--timeout="${SNAPSHOT_TIMEOUT}" --output=snapshot.yaml)
+./aicr snapshot "${snapshot_args[@]}"
+
+echo "--- Snapshot output ---"
+cat snapshot.yaml
diff --git a/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh
new file mode 100644
index 000000000..5a27e6093
--- /dev/null
+++ b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Validate that snapshot.yaml reports the expected GPU model and at least
+# MIN_GPU_COUNT GPUs.
+#
+# Required environment:
+#   EXPECTED_GPU_MODEL  substring that must appear in the reported gpu.model
+#   MIN_GPU_COUNT       non-negative integer lower bound for gpu-count
+
+# MIN_GPU_COUNT is compared arithmetically below. Reject anything that is not
+# plain digits first: an empty value would otherwise evaluate to 0 and let the
+# check pass silently.
+if ! [[ "${MIN_GPU_COUNT}" =~ ^[0-9]+$ ]]; then
+  echo "::error::min_gpu_count must be a non-negative integer, got: '${MIN_GPU_COUNT}'"
+  exit 1
+fi
+
+# Query by subtype field (not index) — #502 added a "hardware" subtype before "smi".
+GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml)
+GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml)
+echo "GPU model: ${GPU_MODEL}"
+echo "GPU count: ${GPU_COUNT}"
+if ! [[ "${GPU_COUNT}" =~ ^[0-9]+$ ]]; then
+  echo "::error::Expected numeric gpu-count in snapshot, got: ${GPU_COUNT}"
+  exit 1
+fi
+if [[ "${GPU_MODEL}" != *"${EXPECTED_GPU_MODEL}"* ]]; then
+  echo "::error::Expected ${EXPECTED_GPU_MODEL} GPU in snapshot, got: ${GPU_MODEL}"
+  exit 1
+fi
+# Force base 10 so zero-padded values such as "08" are not parsed as octal.
+if (( 10#${GPU_COUNT} < 10#${MIN_GPU_COUNT} )); then
+  echo "::error::Expected gpu-count >= ${MIN_GPU_COUNT}, got: ${GPU_COUNT}"
+  exit 1
+fi
+echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml
index 30ac7831f..417130669 100644
--- a/.github/actions/gpu-test-cleanup/action.yml
+++ b/.github/actions/gpu-test-cleanup/action.yml
@@ -23,35 +23,34 @@ inputs:
     description: 'Prefix for the uploaded artifact name'
     required: false
     default: 'gpu-test-debug'
+  collect_artifacts:  # gates the collect, export, and upload steps; Cleanup always runs
+    description: 'Collect and upload debug artifacts before deleting the kind cluster'
+    required: false
+    default: 'false'
 
 runs:
   using: 'composite'
   steps:
     - name: Collect debug artifacts
-      if: failure()
+      if: inputs.collect_artifacts == 'true'
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
-      run: |
-        mkdir -p /tmp/debug-artifacts
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true
-
+      run: bash "${{ github.action_path }}/collect-debug-artifacts.sh"
     - name: Export kind logs
-      if: failure()
+      if: always() && inputs.collect_artifacts == 'true'  # still export when the collection step failed
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
-      run: |
-        mkdir -p /tmp/kind-logs
-        kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true
-
+      run: bash "${{ github.action_path }}/export-kind-logs.sh"
+    - name: Cleanup  # deletes the cluster before the upload; uploaded paths are the /tmp directories written above
+      if: always()
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.action_path }}/cleanup-kind-cluster.sh"
     - name: Upload debug artifacts
-      if: failure()
+      if: always() && inputs.collect_artifacts == 'true'
       uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f  # v6.0.0
       with:
         name: ${{ inputs.artifact_name_prefix }}-${{ github.run_id }}
@@ -59,12 +58,3 @@ runs:
           /tmp/debug-artifacts/
           /tmp/kind-logs/
         retention-days: 7
-
-    - name: Cleanup
-      if: always()
-      shell: bash
-      env:
-        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
-      run: |
-        kind delete cluster --name "${KIND_CLUSTER_NAME}" || true
-        docker system prune -f || true
diff --git a/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh
new file mode 100644
index 000000000..4603d494d
--- /dev/null
+++ b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Best-effort teardown: every call is time-boxed and its failure is ignored.
+docker_timeout() {
+  timeout "$1" docker "${@:2}"
+}
+timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true
+kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+leftover_ids=$(docker_timeout 30s ps -aq --filter "label=${kind_cluster_label}" || true)
+if [[ -n "${leftover_ids}" ]]; then
+  echo "Removing leftover kind containers for ${KIND_CLUSTER_NAME}:"
+  docker_timeout 30s ps -a --filter "label=${kind_cluster_label}" || true
+  # shellcheck disable=SC2086 # Intentional split: one argument per container ID.
+  docker_timeout 30s rm -f ${leftover_ids} || true
+fi
+docker_timeout 60s builder prune -f --filter "until=24h" || true
+docker_timeout 60s system prune -f --filter "until=24h" || true
diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
new file mode 100644
index 000000000..7c780e3f7
--- /dev/null
+++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
@@ -0,0 +1,152 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Diagnostic artifact collection intentionally omits -e so one broken cluster
+# call does not prevent later artifacts from being collected.
+set -uo pipefail
+rm -rf /tmp/debug-artifacts  # start from an empty artifact directory
+mkdir -p /tmp/debug-artifacts
+CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd"
+MAX_KIND_NODE_ARTIFACT_SECONDS="${MAX_KIND_NODE_ARTIFACT_SECONDS:-600}"  # overridable budget for the per-node loop
+kubectl_kind() {  # kubectl against the kind context, capped at 30s per call
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+docker_timeout() {  # usage: docker_timeout <limit> <docker args...>
+  local limit="$1"
+  shift
+  timeout "${limit}" docker "$@"
+}
+
+{
+  date -u || true
+  hostname || true
+  uptime || true
+  nproc || true
+  free -h || true
+  df -h / || true
+  df -ih / || true
+} > /tmp/debug-artifacts/runner-baseline.txt 2>&1 || true
+docker_timeout 30s version > /tmp/debug-artifacts/docker-version.txt 2>&1 || true
+docker_timeout 30s info > /tmp/debug-artifacts/docker-info.txt 2>&1 || true
+timeout 30s nvidia-smi -L > /tmp/debug-artifacts/host-gpus.txt 2>&1 || true  # time-boxed like the docker/kubectl calls so a hung driver cannot stall collection
+timeout 30s nvidia-smi >> /tmp/debug-artifacts/host-gpus.txt 2>&1 || true
+timeout 30s kind get clusters > /tmp/debug-artifacts/kind-clusters.txt 2>&1 || true
+docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+  > /tmp/debug-artifacts/kind-node-containers.txt 2>&1 || true
+
+kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true
+kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true
+kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true
+kubectl_kind get --raw='/readyz?verbose' > /tmp/debug-artifacts/apiserver-readyz.txt 2>&1 || true
+kubectl_kind -n kube-system get pods -l tier=control-plane -o wide \
+  > /tmp/debug-artifacts/control-plane-pods.txt 2>&1 || true
+kubectl_kind -n kube-system get events --sort-by='.lastTimestamp' \
+  > /tmp/debug-artifacts/kube-system-events.txt 2>&1 || true
+for component in ${CONTROL_PLANE_COMPONENTS}; do  # unquoted on purpose: split the space-separated list
+  kubectl_kind -n kube-system describe pod -l "component=${component}" \
+    > "/tmp/debug-artifacts/${component}-describe.txt" 2>&1 || true
+  kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --tail=300 \
+    > "/tmp/debug-artifacts/${component}-logs.txt" 2>&1 || true
+  kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --previous --tail=300 \
+    > "/tmp/debug-artifacts/${component}-previous-logs.txt" 2>&1 || true
+  kubectl_kind -n kube-system get lease "${component}" -o yaml \
+    > "/tmp/debug-artifacts/${component}-lease.yaml" 2>&1 || true
+done
+kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true
+kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
+kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
+kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide \
+  > /tmp/debug-artifacts/monitoring-workloads.txt 2>&1 || true
+kubectl_kind -n monitoring describe deployment kube-prometheus-operator \
+  > /tmp/debug-artifacts/kube-prometheus-operator-deployment-describe.txt 2>&1 || true
+kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=300 \
+  > /tmp/debug-artifacts/kube-prometheus-operator-logs.txt 2>&1 || true
+kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=300 \
+  > /tmp/debug-artifacts/kube-prometheus-operator-previous-logs.txt 2>&1 || true
+kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' \
+  > /tmp/debug-artifacts/monitoring-events.txt 2>&1 || true
+{  # one describe section per kube-prometheus-operator pod
+  kubectl_kind -n monitoring get pods -o name 2>/dev/null \
+    | grep '^pod/kube-prometheus-operator-' \
+    | while read -r pod; do
+        echo "=== ${pod} ==="
+        kubectl_kind -n monitoring describe "${pod}" 2>&1 || true
+      done
+} > /tmp/debug-artifacts/kube-prometheus-operator-pods-describe.txt 2>&1 || true
+kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true
+tar_inputs=()  # only paths that exist in the working directory are archived
+[[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml)
+[[ -d bundle ]] && tar_inputs+=(bundle)
+if [[ "${#tar_inputs[@]}" -gt 0 ]]; then
+  echo "Archiving runtime bundle inputs: ${tar_inputs[*]}"
+  tar -czf /tmp/debug-artifacts/aicr-runtime-bundle.tar.gz "${tar_inputs[@]}" || true
+else
+  echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive"
+fi
+
+artifact_loop_start="$(date +%s)"  # start of the per-node time budget
+docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+  --format '{{.Names}}' | sort | while read -r node_container; do
+    [[ -z "${node_container}" ]] && continue  # ignore empty lines from docker ps
+    artifact_loop_elapsed=$(($(date +%s) - artifact_loop_start))
+    if (( artifact_loop_elapsed > MAX_KIND_NODE_ARTIFACT_SECONDS )); then
+      echo "Kind node artifact collection exceeded ${MAX_KIND_NODE_ARTIFACT_SECONDS}s; stopping after partial collection."
+      break
+    fi
+    node_file="${node_container//[^A-Za-z0-9_.-]/_}"  # container name made safe for use in file names
+    docker_timeout 30s inspect "${node_container}" \
+      > "/tmp/debug-artifacts/${node_file}-docker-inspect.json" 2>&1 || true
+    docker_timeout 30s exec "${node_container}" journalctl -u kubelet \
+      --since "90 minutes ago" --no-pager \
+      > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true
+    docker_timeout 30s exec "${node_container}" journalctl -u containerd \
+      --since "90 minutes ago" --no-pager \
+      > "/tmp/debug-artifacts/${node_file}-containerd-journal.txt" 2>&1 || true
+    docker_timeout 30s exec "${node_container}" crictl ps -a \
+      > "/tmp/debug-artifacts/${node_file}-crictl-ps-a.txt" 2>&1 || true
+    docker_timeout 30s exec "${node_container}" crictl pods \
+      > "/tmp/debug-artifacts/${node_file}-crictl-pods.txt" 2>&1 || true
+    docker_timeout 30s exec "${node_container}" crictl stats \
+      > "/tmp/debug-artifacts/${node_file}-crictl-stats.txt" 2>&1 || true
+    docker_timeout 30s exec "${node_container}" sh -c '
+      date
+      uptime || true
+      free -h || true
+      df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+      echo "--- top cpu/memory processes ---"
+      ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
+    ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true
+    # shellcheck disable=SC2016 # Expanded inside the kind node shell.
+    docker_timeout 120s exec "${node_container}" sh -c '
+      for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do
+        echo "=== ${component} static pod manifest ==="
+        sed -n "1,220p" "/etc/kubernetes/manifests/${component}.yaml" 2>/dev/null || true
+        echo "=== ${component} CRI containers ==="
+        crictl ps -a --name "${component}" || true
+        count=0
+        for container_id in $(crictl ps -a --name "${component}" -q 2>/dev/null); do
+          count=$((count + 1))
+          if [ "${count}" -gt 8 ]; then
+            echo "Skipping remaining ${component} CRI containers after first 8 entries."
+            break
+          fi
+          echo "=== crictl inspect ${component} ${container_id} ==="
+          crictl inspect "${container_id}" || true
+          echo "=== crictl logs ${component} ${container_id} ==="
+          crictl logs --tail=300 "${container_id}" || true
+        done
+      done
+    ' > "/tmp/debug-artifacts/${node_file}-control-plane-cri.txt" 2>&1 || true
+  done || true  # a failing pipeline never fails the step
diff --git a/.github/actions/gpu-test-cleanup/export-kind-logs.sh b/.github/actions/gpu-test-cleanup/export-kind-logs.sh
new file mode 100644
index 000000000..2522481eb
--- /dev/null
+++ b/.github/actions/gpu-test-cleanup/export-kind-logs.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+readonly KIND_LOGS_DIR="/tmp/kind-logs"
+mkdir -p "${KIND_LOGS_DIR}"
+timeout 300s kind export logs "${KIND_LOGS_DIR}" --name "${KIND_CLUSTER_NAME}" || true  # best-effort, capped at 300s
diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml
index fde7bddde..f26aa38a5 100644
--- a/.github/actions/install-karpenter-kwok/action.yml
+++ b/.github/actions/install-karpenter-kwok/action.yml
@@ -19,6 +19,18 @@ inputs:
   cluster_name:
     description: 'Kind cluster name (used for kubectl context)'
     required: true
+  kwok_helm_timeout:
+    description: 'Timeout for KWOK controller Helm install'
+    required: false
+    default: '300s'  # must match <digits><s|m|h>; checked by install-karpenter-kwok.sh
+  ko_build_timeout:
+    description: 'Timeout in seconds for Karpenter KWOK provider ko build'
+    required: false
+    default: '900'  # bare integer greater than 0; checked by install-karpenter-kwok.sh
+  karpenter_helm_timeout:
+    description: 'Timeout for Karpenter Helm install'
+    required: false
+    default: '300s'  # must match <digits><s|m|h>; checked by install-karpenter-kwok.sh
 
 runs:
   using: 'composite'
@@ -26,9 +38,12 @@ runs:
     - name: Resolve versions
       id: versions
       shell: bash
-      run: |
-        echo "karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" >> "$GITHUB_OUTPUT"
-        echo "go=$(go env GOVERSION)" >> "$GITHUB_OUTPUT"
+      run: bash "${{ github.action_path }}/resolve-versions.sh"
+    - name: Install ko
+      uses: ./.github/actions/setup-build-tools
+      with:  # resolve-versions.sh reads the ko version from .build_tools.ko in .settings.yaml
+        install_ko: 'true'
+        ko_version: ${{ steps.versions.outputs.ko }}
 
     - name: Cache Karpenter Go build cache
       uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684  # v4.2.3
@@ -46,7 +61,7 @@ runs:
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
         KARPENTER_VERSION: ${{ steps.versions.outputs.karpenter }}
-      run: |
-        set -euo pipefail
-        bash kwok/scripts/install-karpenter-kwok.sh
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml
+        KWOK_HELM_TIMEOUT: ${{ inputs.kwok_helm_timeout }}
+        KO_BUILD_TIMEOUT: ${{ inputs.ko_build_timeout }}
+        KARPENTER_HELM_TIMEOUT: ${{ inputs.karpenter_helm_timeout }}
+      run: bash "${{ github.action_path }}/install-karpenter-kwok.sh"
diff --git a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
new file mode 100644
index 000000000..8987144ab
--- /dev/null
+++ b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Emit a GitHub Actions error annotation and abort the script.
+die() {
+  echo "::error::$*"
+  exit 1
+}
+
+# Require <digits><s|m|h>, e.g. 300s, 10m, or 1h.
+validate_duration_input() {
+  local name="$1" value="$2"
+  [[ "${value}" =~ ^[0-9]+[smh]$ ]] \
+    || die "${name} must be a duration like 300s, 10m, or 1h; got '${value}'"
+}
+
+# Require a strictly positive integer; base 10 is forced so leading zeros work.
+validate_seconds_input() {
+  local name="$1" value="$2"
+  [[ "${value}" =~ ^[0-9]+$ ]] \
+    || die "${name} must be an integer number of seconds, got '${value}'"
+  (( 10#${value} > 0 )) \
+    || die "${name} must be greater than 0 seconds, got '${value}'"
+}
+
+validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT}"
+validate_seconds_input ko_build_timeout "${KO_BUILD_TIMEOUT}"
+validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT}"
+
+# Inputs are valid: run the repository install script, then apply the NodePool.
+bash kwok/scripts/install-karpenter-kwok.sh
+timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" \
+  apply -f kwok/manifests/karpenter/nodepool.yaml
diff --git a/.github/actions/install-karpenter-kwok/resolve-versions.sh b/.github/actions/install-karpenter-kwok/resolve-versions.sh
new file mode 100644
index 000000000..84e85458e
--- /dev/null
+++ b/.github/actions/install-karpenter-kwok/resolve-versions.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+printf '%s\n' \
+  "karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" \
+  "ko=$(yq eval '.build_tools.ko' .settings.yaml)" \
+  "go=$(go env GOVERSION)" >> "$GITHUB_OUTPUT"  # one append publishes all three step outputs
diff --git a/.github/actions/load-versions/action.yml b/.github/actions/load-versions/action.yml
index b87e321d1..b3c506d40 100644
--- a/.github/actions/load-versions/action.yml
+++ b/.github/actions/load-versions/action.yml
@@ -40,6 +40,9 @@ outputs:
   kind:
     description: 'Kind version'
     value: ${{ steps.versions.outputs.kind }}
+  nvkind:  # read from .testing_tools.nvkind in .settings.yaml
+    description: 'nvkind git ref'
+    value: ${{ steps.versions.outputs.nvkind }}
   ctlptl:
     description: 'ctlptl version'
     value: ${{ steps.versions.outputs.ctlptl }}
@@ -91,6 +94,9 @@ outputs:
   kind_node_image:
     description: 'Kind node image for testing'
     value: ${{ steps.versions.outputs.kind_node_image }}
+  h100_kind_node_image:  # read from .testing.h100_kind_node_image in .settings.yaml
+    description: 'Kind node image for H100 GPU tests'
+    value: ${{ steps.versions.outputs.h100_kind_node_image }}
 
 runs:
   using: 'composite'
@@ -121,6 +127,7 @@ runs:
         # Testing tools
         echo "kubectl=$(yq eval '.testing_tools.kubectl' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "kind=$(yq eval '.testing_tools.kind' .settings.yaml)" >> $GITHUB_OUTPUT
+        echo "nvkind=$(yq eval '.testing_tools.nvkind' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "ctlptl=$(yq eval '.testing_tools.ctlptl' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "tilt=$(yq eval '.testing_tools.tilt' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "helm=$(yq eval '.testing_tools.helm' .settings.yaml)" >> $GITHUB_OUTPUT
@@ -141,6 +148,7 @@ runs:
 
         # Testing configuration
         echo "kind_node_image=$(yq eval '.testing.kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT
+        echo "h100_kind_node_image=$(yq eval '.testing.h100_kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT
 
     - name: Display loaded versions
       shell: bash
@@ -158,6 +166,7 @@ runs:
         echo "  grype: ${{ steps.versions.outputs.grype }}"
         echo "  kubectl: ${{ steps.versions.outputs.kubectl }}"
         echo "  kind: ${{ steps.versions.outputs.kind }}"
+        echo "  nvkind: ${{ steps.versions.outputs.nvkind }}"
         echo "  ctlptl: ${{ steps.versions.outputs.ctlptl }}"
         echo "  tilt: ${{ steps.versions.outputs.tilt }}"
         echo "  helm: ${{ steps.versions.outputs.helm }}"
@@ -172,3 +181,4 @@ runs:
         echo "  lint_timeout: ${{ steps.versions.outputs.lint_timeout }}"
         echo "  test_timeout: ${{ steps.versions.outputs.test_timeout }}"
         echo "  kind_node_image: ${{ steps.versions.outputs.kind_node_image }}"
+        echo "  h100_kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}"
diff --git a/.github/scripts/gpu-chainsaw-health.sh b/.github/scripts/gpu-chainsaw-health.sh
new file mode 100644
index 000000000..7098c6bf8
--- /dev/null
+++ b/.github/scripts/gpu-chainsaw-health.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Exactly one argument is required: the Chainsaw test directory.
+(( $# == 1 )) || { echo "::error::Usage: $0 <test_dir>"; exit 2; }
+test_dir="$1"
+if ! [[ -d "${test_dir}" ]]; then
+  echo "::error::Test directory not found: ${test_dir}"
+  exit 1
+fi
+
+# Both timeouts can be overridden from the environment; the defaults below
+# apply when the variable is unset or empty.
+: "${CHAINSAW_TEST_TIMEOUT:=30m}"
+: "${MONITORING_READY_TIMEOUT:=180s}"
+
+kubectl_kind() {  # kubectl against the kind cluster context: 10s per request, 30s wall clock overall
+  timeout 30s kubectl --context="kind-${KIND_CLUSTER_NAME}" --request-timeout=10s "$@"
+}
+
+kubectl_kind_wait() {
+  # For watch-style calls (rollout status) that carry their own --timeout:
+  # no --request-timeout is passed, so a slow API server cannot cut the watch short.
+  kubectl "--context=kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+print_monitoring_diagnostics() {  # best-effort dump of monitoring namespace state; every query ends in `|| true`
+  printf '%s\n' "=== Monitoring workloads ==="
+  kubectl_kind get deployment,statefulset,daemonset,pods -n monitoring -o wide 2>/dev/null || true
+  printf '%s\n' "=== kube-prometheus-operator deployment ==="
+  kubectl_kind get deployment kube-prometheus-operator -n monitoring -o wide 2>/dev/null || true
+  printf '%s\n' "=== kube-prometheus-operator deployment describe ==="
+  kubectl_kind describe deployment kube-prometheus-operator -n monitoring 2>/dev/null || true
+  printf '%s\n' "=== kube-prometheus-operator pods ==="
+  kubectl_kind get pods -n monitoring -o wide 2>/dev/null \
+    | grep -E '(^NAME|^kube-prometheus-operator-)' || true
+  printf '%s\n' "=== kube-prometheus-operator logs ==="
+  kubectl_kind logs deployment/kube-prometheus-operator -n monitoring --all-containers --tail=200 2>/dev/null || true
+  printf '%s\n' "=== kube-prometheus-operator previous logs ==="
+  kubectl_kind logs deployment/kube-prometheus-operator -n monitoring --all-containers --previous --tail=200 2>/dev/null || true
+  printf '%s\n' "=== Recent events (monitoring) ==="
+  kubectl_kind get events -n monitoring --sort-by='.lastTimestamp' 2>/dev/null | tail -100 || true
+}
+
+wait_for_monitoring_operator() {
+  # Blocks until the kube-prometheus-operator rollout completes or
+  # MONITORING_READY_TIMEOUT elapses. Returns 0 on success, 1 otherwise.
+  echo "Waiting for monitoring/kube-prometheus-operator before Chainsaw..."
+  print_monitoring_diagnostics  # snapshot of the state before waiting
+  if ! kubectl_kind_wait rollout status deployment/kube-prometheus-operator \
+    -n monitoring --timeout="${MONITORING_READY_TIMEOUT}"; then
+    echo "::error::monitoring/kube-prometheus-operator did not become available within ${MONITORING_READY_TIMEOUT}"
+    print_monitoring_diagnostics
+    return 1
+  fi
+  echo "monitoring/kube-prometheus-operator is rolled out."
+}
+
+wait_for_monitoring_operator  # gate: under `set -e` a failed rollout aborts before Chainsaw runs
+
+# Run the Chainsaw suite under a wall-clock limit of CHAINSAW_TEST_TIMEOUT.
+chainsaw_args=(--test-dir "${test_dir}" --config tests/chainsaw/chainsaw-config.yaml)
+chainsaw_args+=(--skip-delete)
+timeout "${CHAINSAW_TEST_TIMEOUT}" chainsaw test "${chainsaw_args[@]}"
diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh
new file mode 100644
index 000000000..3db82a6e8
--- /dev/null
+++ b/.github/scripts/gpu-debug-diagnostics.sh
@@ -0,0 +1,256 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Diagnostic script: intentionally omits -e so each mode can keep collecting
+# partial failure data. Keep -u and pipefail to catch script bugs and pipeline
+# failures while individual kubectl_kind calls tolerate cluster errors.
+set -uo pipefail
+
+mode=${GPU_TEST_DIAGNOSTIC_MODE:-smoke}  # smoke | training | inference; defaults to smoke when unset or empty
+
+kubectl_kind() {  # kubectl against the kind cluster context: 10s per request, 30s wall clock overall
+  timeout 30s kubectl --context="kind-${KIND_CLUSTER_NAME}" --request-timeout=10s "$@"
+}
+
+docker_timeout() {
+  # Usage: docker_timeout <limit> <docker args...>
+  # Runs `docker <args...>` under `timeout <limit>` so the call cannot outlive the limit.
+  timeout "$1" docker "${@:2}"
+}
+
+print_setup_diagnostics() {  # host, Docker, GPU and kind-node baseline; every probe is best-effort (`|| true`)
+  echo "=== Runner baseline ==="
+  date -u || true
+  hostname || true
+  uptime || true
+  nproc || true
+  free -h || true
+  df -h / || true
+  df -ih / || true
+  echo "=== Docker health ==="
+  docker_timeout 30s info >/dev/null 2>&1 && docker_timeout 30s version || true  # bounded like the other docker calls
+  echo "=== Host GPUs ==="
+  timeout 30s nvidia-smi -L || true  # bounded so an unresponsive driver cannot hang the failure path
+  timeout 30s nvidia-smi || true
+  echo "=== Kind clusters ==="
+  timeout 30s kind get clusters || true  # bounded as well, in case the kind CLI blocks
+  echo "=== Kind node containers ==="
+  docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
+  echo "=== Kind node container resources ==="
+  docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+    --format '{{.Names}}' | sort | while read -r node_container; do
+      [[ -z "${node_container}" ]] && continue
+      docker_timeout 30s inspect "${node_container}" \
+        --format '{{.Name}} State={{.State.Status}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' || true
+    done || true
+}
+
+print_workload_images() {  # TSV rows per workload in namespace $1: kind, namespace/name, unique container + initContainer images
+  local ns="$1"
+  local filter='
+      .items[] |
+      [
+        .kind,
+        .metadata.namespace + "/" + .metadata.name,
+        (([.spec.template.spec.containers[]?.image] +
+          [.spec.template.spec.initContainers[]?.image]) | unique | join(","))
+      ] | @tsv
+    '
+  kubectl_kind get deployment,daemonset,statefulset -n "${ns}" -o json 2>/dev/null | jq -r "${filter}" || true
+}
+
+print_workload_inventory() {  # one image-inventory section per namespace passed as an argument
+  echo "=== Workload image inventory ==="
+  while (( $# > 0 )); do
+    echo "--- $1 ---"
+    print_workload_images "$1"
+    shift
+  done
+}
+
+print_component_status_summary() {  # cluster-wide workload overview; kubectl failures are ignored
+  local columns='KIND:.kind,NAMESPACE:.metadata.namespace,NAME:.metadata.name,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas,DESIRED:.status.replicas,UPDATED:.status.updatedReplicas,AGE:.metadata.creationTimestamp'
+  printf '%s\n' "=== Component workload status ==="
+  kubectl_kind get deployments,statefulsets,daemonsets,pods --all-namespaces -o wide 2>/dev/null || true
+  printf '%s\n' "=== Component rollout conditions ==="
+  kubectl_kind get deployments,statefulsets,daemonsets --all-namespaces \
+    -o custom-columns="${columns}" 2>/dev/null || true
+  printf '%s\n' "=== Non-ready pods ==="
+  # Pods whose phase is neither Running nor Succeeded.
+  kubectl_kind get pods --all-namespaces -o wide \
+    --field-selector='status.phase!=Running,status.phase!=Succeeded' 2>/dev/null || true
+}
+
+print_kube_prometheus_operator_diagnostics() {  # best-effort detail on kube-prometheus-operator in the monitoring namespace
+  printf '%s\n' "=== Monitoring workloads ==="
+  kubectl_kind get deployment,statefulset,daemonset,pods -n monitoring -o wide 2>/dev/null || true
+  printf '%s\n' "=== kube-prometheus-operator deployment ==="
+  kubectl_kind get deployment kube-prometheus-operator -n monitoring -o wide 2>/dev/null || true
+  printf '%s\n' "=== kube-prometheus-operator deployment describe ==="
+  kubectl_kind describe deployment kube-prometheus-operator -n monitoring 2>/dev/null || true
+  printf '%s\n' "=== kube-prometheus-operator pod describe ==="
+  kubectl_kind get pods -n monitoring -o name 2>/dev/null \
+    | grep '^pod/kube-prometheus-operator-' \
+    | while read -r pod_ref; do
+        echo "--- ${pod_ref} ---"
+        kubectl_kind describe "${pod_ref}" -n monitoring 2>/dev/null || true
+      done || true
+  printf '%s\n' "=== kube-prometheus-operator logs ==="
+  kubectl_kind logs deployment/kube-prometheus-operator -n monitoring --all-containers --tail=200 2>/dev/null || true
+  printf '%s\n' "=== kube-prometheus-operator previous logs ==="
+  kubectl_kind logs deployment/kube-prometheus-operator -n monitoring --all-containers --previous --tail=200 2>/dev/null || true
+  printf '%s\n' "=== Recent events (monitoring) ==="
+  kubectl_kind get events -n monitoring --sort-by='.lastTimestamp' 2>/dev/null | tail -80 || true
+}
+
+print_kai_diagnostics() {  # best-effort dump of kai-scheduler namespace state plus cluster-wide queues and podgroups
+  printf '%s\n' "=== KAI scheduler pods ==="
+  kubectl_kind get pods -n kai-scheduler -o wide 2>/dev/null || true
+  printf '%s\n' "=== KAI admission deployment ==="
+  kubectl_kind get deployment admission -n kai-scheduler -o wide 2>/dev/null || true
+  printf '%s\n' "=== KAI admission deployment describe ==="
+  kubectl_kind describe deployment admission -n kai-scheduler 2>/dev/null || true
+  printf '%s\n' "=== KAI admission pod describe ==="
+  kubectl_kind get pods -n kai-scheduler -o name 2>/dev/null \
+    | grep '^pod/admission-' \
+    | while read -r pod_ref; do
+        kubectl_kind describe "${pod_ref}" -n kai-scheduler 2>/dev/null || true
+      done || true
+  printf '%s\n' "=== KAI admission logs ==="
+  kubectl_kind logs deployment/admission -n kai-scheduler --all-containers --tail=200 2>/dev/null || true
+  printf '%s\n' "=== KAI scheduler logs ==="
+  kubectl_kind logs deployment/kai-scheduler-default -n kai-scheduler --tail=100 2>/dev/null || true
+  printf '%s\n' "=== KAI scheduler queues ==="
+  kubectl_kind get queues --all-namespaces 2>/dev/null || true
+  printf '%s\n' "=== KAI scheduler podgroups ==="
+  kubectl_kind get podgroups --all-namespaces 2>/dev/null || true
+  printf '%s\n' "=== Recent events (kai-scheduler) ==="
+  kubectl_kind get events -n kai-scheduler --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
+}
+
+print_custom_metrics() {
+  # Usage: print_custom_metrics <namespace>...
+  # Queries the custom metrics API for three GPU pod metrics in every namespace
+  # given, pretty-printing each response with jq. Failures are ignored.
+  local metric ns
+  echo "=== Custom metrics API ==="
+  for metric in gpu_utilization gpu_memory_used gpu_power_usage; do
+    for ns in "$@"; do
+      echo "--- ${ns}/${metric} ---"
+      kubectl_kind get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${ns}/pods/*/${metric}" 2>/dev/null \
+        | jq . || true
+    done
+  done
+}
+
+print_metrics_pipeline_diagnostics() {  # best-effort listing of adapter, exporter and monitoring pods, ResourceSlices and nodes
+  printf '%s\n' "=== prometheus-adapter pods ==="
+  kubectl_kind get pods -n monitoring -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
+  printf '%s\n' "=== DCGM Exporter pods ==="
+  kubectl_kind get pods -n gpu-operator -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
+  printf '%s\n' "=== Monitoring pods ==="
+  kubectl_kind get pods -n monitoring -o wide 2>/dev/null || true
+  printf '%s\n' "=== DRA ResourceSlices ==="
+  kubectl_kind get resourceslices -o wide 2>/dev/null || true
+  printf '%s\n' "=== Node status ==="
+  kubectl_kind get nodes -o wide 2>/dev/null || true
+}
+
+print_common_gpu_diagnostics() {  # best-effort GPU Operator state: ClusterPolicy, pods, unhealthy pods, recent events
+  printf '%s\n' "=== ClusterPolicy status ==="
+  kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true
+  printf '%s\n' "=== GPU Operator pods ==="
+  kubectl_kind get pods -n gpu-operator -o wide 2>/dev/null || true
+  printf '%s\n' "=== Non-running pods (all namespaces) ==="
+  kubectl_kind get pods --all-namespaces --field-selector='status.phase!=Running,status.phase!=Succeeded' 2>/dev/null || true
+  printf '%s\n' "=== Recent events (gpu-operator) ==="
+  kubectl_kind get events -n gpu-operator --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+}
+
+print_h100_common_diagnostics() {
+  # Diagnostics shared by the training and inference modes.
+  # Arguments: mode-specific namespaces. They are appended to the common list
+  # for the workload image inventory and queried, after gpu-operator, for
+  # custom metrics.
+  local -a extra_ns=("$@")
+  local -a base_ns=(
+    cert-manager gpu-operator monitoring skyhook
+    nvsentinel nvidia-dra-driver nvidia-network-operator kai-scheduler
+  )
+
+  # Host and cluster-wide overview first, then per-component detail.
+  print_setup_diagnostics
+  print_component_status_summary
+  print_workload_inventory "${base_ns[@]}" "${extra_ns[@]}"
+  print_common_gpu_diagnostics
+  print_kube_prometheus_operator_diagnostics
+  print_kai_diagnostics
+  print_custom_metrics gpu-operator "${extra_ns[@]}"
+  print_metrics_pipeline_diagnostics
+  printf '%s\n' "=== Node resources ==="
+  # Keep 20 lines of context after each "Allocated resources" heading.
+  kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true
+}
+
+print_kubeflow_diagnostics() {  # best-effort Kubeflow Trainer state: deployment, pods, validating webhook, TrainJob CRD
+  printf '%s\n' "=== Kubeflow Trainer deployment ==="
+  kubectl_kind get deployment kubeflow-trainer-controller-manager -n kubeflow -o wide 2>/dev/null || true
+  printf '%s\n' "=== Kubeflow pods ==="
+  kubectl_kind get pods -n kubeflow -o wide 2>/dev/null || true
+  printf '%s\n' "=== Kubeflow validating webhooks ==="
+  kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
+  printf '%s\n' "=== Kubeflow Trainer CRD ==="
+  kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
+}
+
+print_dynamo_diagnostics() {  # best-effort dynamo-system state: pods, operator manager logs, recent events
+  printf '%s\n' "=== Dynamo pods ==="
+  kubectl_kind get pods -n dynamo-system -o wide 2>/dev/null || true
+  printf '%s\n' "=== Dynamo operator logs ==="
+  kubectl_kind logs deployment/dynamo-platform-dynamo-operator-controller-manager -n dynamo-system -c manager --tail=100 2>/dev/null || true
+  printf '%s\n' "=== Recent events (dynamo-system) ==="
+  kubectl_kind get events -n dynamo-system --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+}
+
+print_kgateway_diagnostics() {  # best-effort kgateway state: pods, GatewayClass objects, Gateways in all namespaces
+  printf '%s\n' "=== kgateway pods ==="
+  kubectl_kind get pods -n kgateway-system -o wide 2>/dev/null || true
+  printf '%s\n' "=== GatewayClass status ==="
+  kubectl_kind get gatewayclass -o yaml 2>/dev/null || true
+  printf '%s\n' "=== Gateway status ==="
+  kubectl_kind get gateways --all-namespaces -o yaml 2>/dev/null || true
+}
+
+# Dispatch on the requested mode (GPU_TEST_DIAGNOSTIC_MODE, default "smoke").
+if [[ "${mode}" == "smoke" ]]; then
+  # Lightweight set: host baseline, GPU Operator state, node list.
+  print_setup_diagnostics
+  print_common_gpu_diagnostics
+  echo "=== Node status ==="
+  kubectl_kind get nodes -o wide 2>/dev/null || true
+elif [[ "${mode}" == "training" ]]; then
+  # Shared H100 diagnostics plus Kubeflow detail.
+  print_h100_common_diagnostics kubeflow
+  print_kubeflow_diagnostics
+elif [[ "${mode}" == "inference" ]]; then
+  # Shared H100 diagnostics plus Dynamo and kgateway detail.
+  print_h100_common_diagnostics dynamo-system kgateway-system
+  print_dynamo_diagnostics
+  print_kgateway_diagnostics
+else
+  # Unrecognised mode: emit an error annotation and fail.
+  echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}"
+  exit 1
+fi
diff --git a/.github/scripts/gpu-runtime-component-health.sh b/.github/scripts/gpu-runtime-component-health.sh
new file mode 100644
index 000000000..3d668d37b
--- /dev/null
+++ b/.github/scripts/gpu-runtime-component-health.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Validate the invocation before any cluster wait starts: exactly one argument,
+# and it must be a known mode. Otherwise a mistyped mode would only be reported
+# after the shared monitoring and kai-scheduler waits had already run.
+(( $# == 1 )) || { echo "::error::Usage: $0 <training|inference>"; exit 2; }
+[[ "$1" == training || "$1" == inference ]] || { echo "::error::unknown runtime component health mode: $1"; exit 2; }
+mode="$1"
+COMPONENT_HEALTH_TIMEOUT="${COMPONENT_HEALTH_TIMEOUT:-120s}"  # overridable; must be <digits><s|m|h>
+
+duration_seconds() {
+  # Converts "<digits><s|m|h>" (e.g. 120s, 2m, 1h) to seconds on stdout. Any
+  # other form, including compound values such as 2m30s, is rejected with an
+  # ::error:: annotation on stderr and exit status 1.
+  local input_value="$1"
+  if [[ ! "${input_value}" =~ ^([0-9]+)([smh])$ ]]; then
+    echo "::error::unsupported duration '${input_value}'" >&2
+    exit 1
+  fi
+  case "${BASH_REMATCH[2]}" in
+    s) echo "$((10#${BASH_REMATCH[1]}))" ;;  # 10# forces base 10 so "090" is not read as octal
+    m) echo "$((10#${BASH_REMATCH[1]} * 60))" ;;
+    h) echo "$((10#${BASH_REMATCH[1]} * 3600))" ;;
+  esac
+}
+
+kubectl_kind() {  # kubectl against the kind cluster context: 10s per request, 30s wall clock overall
+  timeout 30s kubectl --context="kind-${KIND_CLUSTER_NAME}" --request-timeout=10s "$@"
+}
+
+kubectl_kind_wait() {
+  # For `kubectl wait`, whose watch is bounded by its own --timeout:
+  # no --request-timeout is passed, so a slow API server cannot cut the watch short.
+  kubectl "--context=kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+print_namespace_diagnostics() {
+  # Usage: print_namespace_diagnostics <namespace>. Best-effort; kubectl errors are ignored.
+  local namespace="$1"
+  printf '%s\n' "=== ${namespace} workloads ==="
+  kubectl_kind get deployments,statefulsets,daemonsets,pods -n "${namespace}" -o wide 2>/dev/null || true
+  printf '%s\n' "=== Recent events (${namespace}) ==="
+  kubectl_kind get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -80 || true
+}
+
+wait_for_deployments() {
+  # Usage: wait_for_deployments <namespace> <deployment/name>...
+  # Waits until every listed deployment reports condition Available, bounded by
+  # COMPONENT_HEALTH_TIMEOUT. Returns 0 on success; otherwise emits an error
+  # annotation, prints namespace diagnostics and returns 1.
+  local ns="$1"
+  local targets=("${@:2}")
+
+  echo "Waiting up to ${COMPONENT_HEALTH_TIMEOUT} for ${ns}: ${targets[*]}"
+  kubectl_kind_wait wait "${targets[@]}" -n "${ns}" \
+    --for=condition=Available \
+    --timeout="${COMPONENT_HEALTH_TIMEOUT}" && return 0
+
+  echo "::error::One or more deployments in ${ns} did not become Available within ${COMPONENT_HEALTH_TIMEOUT}: ${targets[*]}"
+  print_namespace_diagnostics "${ns}"
+  return 1
+}
+
+wait_for_required_object() {
+  # Usage: wait_for_required_object <type/name>
+  # Polls `kubectl get` every 2s until the object can be fetched or
+  # COMPONENT_HEALTH_TIMEOUT has elapsed. Returns 0 once found, 1 otherwise.
+  local resource="$1"
+  local budget deadline
+  # Assigned separately from `local` so a duration_seconds failure is not masked.
+  budget="$(duration_seconds "${COMPONENT_HEALTH_TIMEOUT}")"
+  deadline=$(( SECONDS + budget ))
+
+  echo "Waiting up to ${COMPONENT_HEALTH_TIMEOUT} for ${resource}"
+  until (( SECONDS > deadline )); do
+    kubectl_kind get "${resource}" >/dev/null && return 0
+    sleep 2
+  done
+
+  echo "::error::Required object is missing: ${resource}"
+  kubectl_kind get "${resource}" -o yaml 2>/dev/null || true
+  kubectl_kind describe "${resource}" 2>/dev/null || true
+  return 1
+}
+
+echo "=== Runtime component health (${mode}) ==="
+
+# Components checked in every mode. None of the waits below is used as a
+# condition, so under `set -e` the first failing wait ends the script.
+wait_for_deployments monitoring deployment/kube-prometheus-operator
+
+kai_deployments=(
+  deployment/kai-scheduler-default
+  deployment/admission
+  deployment/binder
+  deployment/kai-operator
+  deployment/pod-grouper
+  deployment/podgroup-controller
+  deployment/queue-controller
+)
+wait_for_deployments kai-scheduler "${kai_deployments[@]}"
+
+# Mode-specific components.
+if [[ "${mode}" == "training" ]]; then
+  wait_for_deployments kubeflow deployment/kubeflow-trainer-controller-manager
+  wait_for_required_object validatingwebhookconfiguration/validator.trainer.kubeflow.org
+  wait_for_required_object customresourcedefinition/trainjobs.trainer.kubeflow.org
+elif [[ "${mode}" == "inference" ]]; then
+  wait_for_deployments dynamo-system \
+    deployment/dynamo-platform-dynamo-operator-controller-manager \
+    deployment/grove-operator
+  wait_for_deployments kgateway-system \
+    deployment/kgateway \
+    deployment/inference-gateway
+else
+  # Any other mode is reported as an error with exit status 2.
+  echo "::error::unknown runtime component health mode: ${mode}"
+  exit 2
+fi
+
+echo "Runtime component health check passed."
diff --git a/.github/scripts/gpu-smoke-run-nvidia-smi.sh b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
new file mode 100644
index 000000000..0d4ea31d7
--- /dev/null
+++ b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+: "${KUBECTL_REQUEST_TIMEOUT:=10s}"  # per-request limit for ordinary kubectl calls
+: "${KUBECTL_WAIT_REQUEST_TIMEOUT:=130s}"  # per-request limit for `kubectl wait` calls
+: "${POD_NAME_FILE:=/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}"  # where the created pod's name is written
+
+kubectl_kind() {  # kubectl against the kind cluster context with the short per-request timeout
+  kubectl --context="kind-${KIND_CLUSTER_NAME}" --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" "$@"
+}
+
+kubectl_kind_wait() {  # as kubectl_kind, but with the longer wait request timeout and a 150s wall-clock cap
+  timeout 150s kubectl --context="kind-${KIND_CLUSTER_NAME}" --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" "$@"
+}
+
+# Create the pod from the inline manifest and capture the name reported back by the API server.
+pod_name=$(kubectl_kind create -f - -o jsonpath='{.metadata.name}' <<'EOF'
+apiVersion: v1
+kind: Pod
+metadata:
+  generateName: gpu-smoke-test-
+  labels:
+    app: gpu-smoke-test
+spec:
+  restartPolicy: Never
+  containers:
+  - name: nvidia-smi
+    image: ubuntu:22.04
+    command: ["nvidia-smi"]
+    resources:
+      limits:
+        nvidia.com/gpu: 1
+EOF
+)
+echo "${pod_name}" > "${POD_NAME_FILE}"  # gpu-smoke-show-nvidia-smi-log.sh reads the same default path
+
+# The Ready wait is best-effort (its result is ignored); the Succeeded wait
+# that follows is the one that decides whether this script passes.
+echo "Waiting for ${pod_name} pod to complete..."
+kubectl_kind_wait wait --for=condition=Ready --timeout=120s "pod/${pod_name}" || true
+kubectl_kind_wait wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s "pod/${pod_name}"
diff --git a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
new file mode 100644
index 000000000..05bc09523
--- /dev/null
+++ b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+: "${KUBECTL_REQUEST_TIMEOUT:=10s}"  # per-request limit for kubectl calls
+: "${POD_NAME_FILE:=/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}"  # file expected to hold the smoke pod's name
+trap 'rm -f "${POD_NAME_FILE}"' EXIT  # the file is removed whenever this script exits
+
+kubectl_kind() {  # kubectl against the kind cluster context with a per-request timeout
+  kubectl --context="kind-${KIND_CLUSTER_NAME}" --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" "$@"
+}
+
+# Resolve which pod's logs to show.
+# First choice: the name recorded in POD_NAME_FILE, kept only if that pod can
+# still be fetched from the cluster.
+pod_name=""
+if [[ -f "${POD_NAME_FILE}" ]]; then
+  pod_name="$(<"${POD_NAME_FILE}")"
+  if [[ -n "${pod_name}" ]]; then
+    kubectl_kind get pod "${pod_name}" >/dev/null 2>&1 || pod_name=""
+  fi
+fi
+
+# Fallback: the most recently created pod labelled app=gpu-smoke-test.
+if [[ -z "${pod_name}" ]]; then
+  pod_name=$(kubectl_kind get pods -l app=gpu-smoke-test \
+    --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[-1:].metadata.name}')
+fi
+
+# Neither lookup produced a name: fail with an error annotation.
+[[ -n "${pod_name}" ]] || { echo "::error::no gpu-smoke-test pod found"; exit 1; }
+
+kubectl_kind logs "${pod_name}"
diff --git a/.github/scripts/gpu-validate-conformance.sh b/.github/scripts/gpu-validate-conformance.sh
new file mode 100644
index 000000000..79550cb3a
--- /dev/null
+++ b/.github/scripts/gpu-validate-conformance.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Run the conformance phase of `aicr validate`, with
+# AICR_VALIDATOR_IMAGE_REGISTRY=ko.local set for that one command only.
+validate_args=(
+  --recipe recipe.yaml
+  --phase conformance
+  --namespace gpu-operator
+  --kubeconfig="${HOME}/.kube/config" --require-gpu
+  --image=ko.local:smoke-test --timeout=10m --toleration '*'
+  --output=validation-result.yaml --evidence-dir=conformance-evidence
+)
+
+env AICR_VALIDATOR_IMAGE_REGISTRY=ko.local ./aicr validate "${validate_args[@]}"
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index c5e1882d4..4f06bb396 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -16,12 +16,10 @@ name: GPU Inference Test (nvkind + H100 x2)
 
 on:
   schedule:
-    - cron: '15 6,18 * * *'  # Every 12 hours (2x daily), offset from T4 smoke test
+    - cron: '15 6,18 * * *'  # Every 12 hours (2x daily), offset from training test
   push:
     branches:
       - "pull-request/[0-9]+"
-  pull_request:
-    types: [labeled]
   workflow_dispatch: {}  # Allow manual runs
 
 permissions:
@@ -40,6 +38,8 @@ jobs:
       should-run: ${{ steps.filter.outputs.matched }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
       - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d  # v4.0.1
         id: filter
         with:
@@ -50,13 +50,21 @@ jobs:
               - '.settings.yaml'
               - '.github/actions/gpu-cluster-setup/**'
               - '.github/actions/gpu-operator-install/**'
+              - '.github/actions/check-control-plane-health/**'
               - '.github/actions/aicr-build/**'
               - '.github/actions/setup-build-tools/**'
               - '.github/actions/install-karpenter-kwok/**'
               - 'validators/*/Dockerfile'
               - 'pkg/evidence/**'
+              - '.github/workflows/gpu-h100-kind-runtime-test.yaml'
+              - '.github/actions/gpu-debug-diagnostics/**'
               - '.github/actions/gpu-test-cleanup/**'
               - '.github/actions/load-versions/**'
+              - '.github/scripts/gpu-chainsaw-health.sh'
+              - '.github/scripts/gpu-debug-diagnostics.sh'
+              - '.github/scripts/gpu-runtime-component-health.sh'
+              - '.github/scripts/gpu-validate-conformance.sh'
+              - 'pkg/bundler/deployer/helm/**'
               - 'tests/chainsaw/chainsaw-config.yaml'
               - 'tests/chainsaw/ai-conformance/main.go'
               - 'tests/chainsaw/ai-conformance/common/**'
@@ -82,203 +90,23 @@ jobs:
               - 'pkg/defaults/timeouts.go'
               - 'validators/conformance/**'
 
+  # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+  # checkout. PR GPU coverage runs through the pull-request/<number> push
+  # mirror after ok-to-test approval.
   gpu-inference-test:
     needs: [check-paths]
     if: >
       always() && (
         github.event_name == 'schedule' ||
         github.event_name == 'workflow_dispatch' ||
-        (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') ||
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
     name: GPU Inference Test (nvkind + H100 x2)
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 120
-
-    env:
-      KIND_CLUSTER_NAME: gpu-inference-test
-
-    steps:
-
-      - name: Checkout Code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Set up GPU cluster
-        uses: ./.github/actions/gpu-cluster-setup
-
-      - name: Build aicr
-        uses: ./.github/actions/aicr-build
-        with:
-          validator_phases: 'conformance'
-
-      - name: Install runtime bundle
-        id: bundle-install
-        uses: ./.github/actions/gpu-operator-install
-        with:
-          method: bundle
-          accelerator: h100
-          platform: dynamo
-
-      # --- Snapshot and GPU validation ---
-
-      - name: Snapshot and validate GPU
-        uses: ./.github/actions/gpu-snapshot-validate
-        with:
-          gpu_model: H100
-          min_gpu_count: '2'
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-
-      # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
-
-      - name: Install Karpenter + KWOK
-        uses: ./.github/actions/install-karpenter-kwok
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-
-      # --- Health checks ---
-
-      - name: Prepare chainsaw
-        id: versions
-        uses: ./.github/actions/load-versions
-
-      - name: Install chainsaw
-        uses: ./.github/actions/setup-build-tools
-        with:
-          install_chainsaw: 'true'
-          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
-
-      - name: Run chainsaw health checks
-        run: |
-          chainsaw test \
-            --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --config tests/chainsaw/chainsaw-config.yaml
-
-      # --- CNCF AI Conformance validation ---
-      # Runs after the stack health checks so gateway and metrics validators
-      # see a settled inference stack.
-
-      - name: Verify expected resources exist
-        run: |
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug
-
-      - name: Validate CNCF AI Conformance
-        id: validate-conformance
-        run: |
-          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
-          ./aicr validate \
-            --recipe recipe.yaml \
-            --phase conformance \
-            --namespace gpu-operator \
-            --kubeconfig="${HOME}/.kube/config" \
-            --require-gpu \
-            --image=ko.local:smoke-test \
-            --timeout=10m \
-            --toleration '*' \
-            --output=validation-result.yaml \
-            --evidence-dir=conformance-evidence
-
-      # Dynamo smoke is intentionally disabled for now. The vLLM runtime image
-      # adds significant latency and flakiness in Kind CI, and training has no
-      # matching smoke path yet. Reintroduce it later alongside a symmetric
-      # training smoke test if needed.
-      # --- Validation artifacts ---
-
-      # Collect a post-run resource snapshot regardless of whether conformance
-      # validation ran, so triage always has a cluster-state artifact.
-      - name: Collect validation artifacts
-        if: >-
-          always()
-          && !cancelled()
-          && steps.bundle-install.outcome == 'success'
-        continue-on-error: true
-        shell: bash
-        run: |
-          set -o pipefail
-          mkdir -p conformance-evidence
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug | tee conformance-evidence/resource-existence-post.txt
-
-      - name: Upload validation artifacts
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: conformance-evidence
-          path: |
-            conformance-evidence/
-            validation-result.yaml
-          if-no-files-found: warn
-
-      - name: Debug diagnostics
-        if: failure()
-        run: |
-          echo "=== ClusterPolicy status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true
-          echo "=== GPU Operator pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
-          echo "=== Non-running pods (all namespaces) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-          echo "=== Recent events (gpu-operator) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-          echo "=== Dynamo pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true
-          echo "=== Dynamo operator logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
-            logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
-          echo "=== Recent events (dynamo-system) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-          echo "=== Custom metrics API ==="
-          for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
-            echo "--- ${METRIC} ---"
-            for NS in gpu-operator dynamo-system; do
-              kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
-                "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true
-            done
-          done
-          echo "=== Grafana deployment ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true
-          echo "=== Grafana pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \
-            -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true
-          echo "=== Grafana deployment describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true
-          echo "=== Grafana pod describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \
-            -l app.kubernetes.io/name=grafana 2>/dev/null || true
-          echo "=== prometheus-adapter pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
-          echo "=== kgateway pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kgateway-system get pods -o wide 2>/dev/null || true
-          echo "=== GatewayClass status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true
-          echo "=== Gateway status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get gateways -A -o yaml 2>/dev/null || true
-          echo "=== DCGM Exporter pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-            get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
-          echo "=== Monitoring pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -o wide 2>/dev/null || true
-          echo "=== DRA ResourceSlices ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get resourceslices -o wide 2>/dev/null || true
-          echo "=== Node status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true
-
-      - name: GPU Test Cleanup
-        if: always()
-        uses: ./.github/actions/gpu-test-cleanup
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          artifact_name_prefix: gpu-inference-test-debug
+    uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml
+    with:
+      job_name: GPU Inference Test (nvkind + H100 x2)
+      cluster_name: gpu-inference-test
+      intent: inference
+      platform: dynamo
+      chainsaw_path: tests/chainsaw/ai-conformance/kind-inference-dynamo
+      artifact_name_prefix: gpu-inference-test-debug
diff --git a/.github/workflows/gpu-h100-kind-runtime-test.yaml b/.github/workflows/gpu-h100-kind-runtime-test.yaml
new file mode 100644
index 000000000..6d0f8757b
--- /dev/null
+++ b/.github/workflows/gpu-h100-kind-runtime-test.yaml
@@ -0,0 +1,235 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Reusable workflow: callers pass the intent/platform pair, the chainsaw
+# health-check directory, and naming inputs. The single job sets up a Kind GPU
+# cluster, installs the runtime bundle, validates a snapshot, installs
+# Karpenter + KWOK, runs chainsaw health checks, then conformance validation.
+name: GPU H100 Kind Runtime Test
+
+on:
+  workflow_call:
+    inputs:
+      job_name:
+        description: 'Display name for the H100 runtime job'
+        required: true
+        type: string
+      cluster_name:
+        description: 'Kind cluster name'
+        required: true
+        type: string
+      intent:
+        description: 'Runtime intent passed to the bundle installer'
+        required: true
+        type: string
+      platform:
+        description: 'Runtime platform passed to the bundle installer'
+        required: true
+        type: string
+      chainsaw_path:
+        description: 'Chainsaw health-check directory'
+        required: true
+        type: string
+      artifact_name_prefix:
+        description: 'Prefix for uploaded debug artifacts'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  gpu-h100-kind-runtime-test:
+    name: ${{ inputs.job_name }}
+    runs-on: linux-amd64-gpu-h100-latest-2
+    # Cold self-hosted H100 runners can spend most of this budget pulling
+    # images and loading Kind nodes before validation starts.
+    timeout-minutes: 180
+    concurrency:
+      group: gpu-h100-${{ inputs.cluster_name }}-${{ github.event_name }}-${{ github.ref }}
+      cancel-in-progress: true
+
+    env:
+      KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+
+      - name: Load GPU test versions
+        id: versions
+        uses: ./.github/actions/load-versions
+
+      - name: Set up GPU cluster
+        uses: ./.github/actions/gpu-cluster-setup
+        with:
+          kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}
+          min_gpu_count: '2'
+          gpu_model_pattern: H100
+          min_free_disk_gb: '50'
+          min_available_memory_gb: '16'
+          cluster_create_timeout: 900s
+          control_plane_resource_patches: 'true'
+          control_plane_leader_election_tuning: 'true'
+
+      - name: Build aicr and snapshot agent image
+        uses: ./.github/actions/aicr-build
+        with:
+          build_snapshot_agent: 'true'
+          validator_phases: 'none'
+
+      # Fast readiness gate after cluster setup. Stability windows start after
+      # runtime install, where component rollouts can stress the control plane.
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+          stability_window: 0s
+          recover_unhealthy: 'true'
+
+      - name: Install runtime bundle
+        id: bundle-install
+        uses: ./.github/actions/gpu-operator-install
+        with:
+          method: bundle
+          accelerator: h100
+          intent: ${{ inputs.intent }}
+          platform: ${{ inputs.platform }}
+          wait: 'true'
+          best_effort: 'false'
+
+      # The intent is passed via an environment variable so the input is never
+      # expanded inside the shell command line.
+      - name: Check runtime component health
+        env:
+          INPUT_INTENT: ${{ inputs.intent }}
+        run: bash .github/scripts/gpu-runtime-component-health.sh "${INPUT_INTENT}"
+
+      # Runtime install creates many CRDs, webhooks, and controllers. Keep a
+      # stability window here to catch KCM/scheduler restarts before snapshot.
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
+
+      - name: Snapshot and validate GPU
+        uses: ./.github/actions/gpu-snapshot-validate
+        with:
+          gpu_model: H100
+          min_gpu_count: '2'
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          snapshot_timeout: 10m
+
+      # Snapshot deploys a GPU Job and exercises cluster discovery; verify the
+      # control plane stayed stable before adding Karpenter/KWOK.
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
+
+      - name: Install Karpenter + KWOK
+        uses: ./.github/actions/install-karpenter-kwok
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          kwok_helm_timeout: 600s
+          ko_build_timeout: '1200'
+          karpenter_helm_timeout: 600s
+
+      - name: Install chainsaw
+        uses: ./.github/actions/setup-build-tools
+        with:
+          install_chainsaw: 'true'
+          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
+
+      # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above
+      # only installs a runner-side binary.
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
+
+      # The chainsaw directory is passed via an environment variable so the
+      # input is never expanded inside the shell command line.
+      - name: Run chainsaw health checks
+        env:
+          INPUT_CHAINSAW_PATH: ${{ inputs.chainsaw_path }}
+        run: bash .github/scripts/gpu-chainsaw-health.sh "${INPUT_CHAINSAW_PATH}"
+
+      - name: Build conformance validator image
+        uses: ./.github/actions/aicr-build
+        with:
+          build_cli: 'false'
+          build_snapshot_agent: 'false'
+          validator_phases: 'conformance'
+
+      # Validator image build/load can contend with Docker and kind containerd;
+      # verify the control plane before the final conformance workload.
+      - name: Check control plane health
+        uses: ./.github/actions/check-control-plane-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          wait_timeout: 120s
+          stability_window: 60s
+          recover_unhealthy: 'true'
+
+      - name: Validate CNCF AI Conformance
+        id: validate-conformance
+        run: bash .github/scripts/gpu-validate-conformance.sh
+
+      - name: Upload validation artifacts
+        if: always()
+        timeout-minutes: 5
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
+        with:
+          name: conformance-evidence
+          path: |
+            conformance-evidence/
+            validation-result.yaml
+          if-no-files-found: warn
+
+      - name: Debug diagnostics
+        if: failure()
+        timeout-minutes: 5
+        uses: ./.github/actions/gpu-debug-diagnostics
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          mode: ${{ inputs.intent }}
+
+      # Only sets the output on failure or cancellation; on success it stays
+      # unset and the cleanup step receives collect_artifacts=false.
+      - name: Mark debug artifact collection
+        id: gpu-debug-artifacts
+        if: failure() || cancelled()
+        shell: bash
+        run: echo "collect=true" >> "${GITHUB_OUTPUT}"
+
+      - name: GPU Test Cleanup
+        if: always()
+        uses: ./.github/actions/gpu-test-cleanup
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          artifact_name_prefix: ${{ inputs.artifact_name_prefix }}
+          collect_artifacts: ${{ steps.gpu-debug-artifacts.outputs.collect == 'true' }}
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index d3a04de03..51fbed8ba 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -20,8 +20,6 @@ on:
   push:
     branches:
       - "pull-request/[0-9]+"
-  pull_request:
-    types: [labeled]
   workflow_dispatch: {}  # Allow manual runs
 
 permissions:
@@ -40,6 +38,8 @@ jobs:
       should-run: ${{ steps.filter.outputs.matched }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
       - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d  # v4.0.1
         id: filter
         with:
@@ -50,13 +50,21 @@ jobs:
               - '.settings.yaml'
               - '.github/actions/gpu-cluster-setup/**'
               - '.github/actions/gpu-operator-install/**'
+              - '.github/actions/check-control-plane-health/**'
               - '.github/actions/aicr-build/**'
               - '.github/actions/setup-build-tools/**'
               - '.github/actions/install-karpenter-kwok/**'
               - 'validators/*/Dockerfile'
               - 'pkg/evidence/**'
+              - '.github/workflows/gpu-h100-kind-runtime-test.yaml'
+              - '.github/actions/gpu-debug-diagnostics/**'
               - '.github/actions/gpu-test-cleanup/**'
               - '.github/actions/load-versions/**'
+              - '.github/scripts/gpu-chainsaw-health.sh'
+              - '.github/scripts/gpu-debug-diagnostics.sh'
+              - '.github/scripts/gpu-runtime-component-health.sh'
+              - '.github/scripts/gpu-validate-conformance.sh'
+              - 'pkg/bundler/deployer/helm/**'
               - 'tests/chainsaw/chainsaw-config.yaml'
               - 'tests/chainsaw/ai-conformance/main.go'
               - 'tests/chainsaw/ai-conformance/common/**'
@@ -78,187 +86,23 @@ jobs:
               - 'pkg/defaults/timeouts.go'
               - 'validators/conformance/**'
 
+  # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+  # checkout. PR GPU coverage runs through the pull-request/<number> push
+  # mirror after ok-to-test approval.
   gpu-training-test:
     needs: [check-paths]
     if: >
       always() && (
         github.event_name == 'schedule' ||
         github.event_name == 'workflow_dispatch' ||
-        (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') ||
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
     name: GPU Training Test (nvkind + H100 x2)
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 120
-
-    env:
-      KIND_CLUSTER_NAME: gpu-training-test
-
-    steps:
-
-      - name: Checkout Code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Set up GPU cluster
-        uses: ./.github/actions/gpu-cluster-setup
-
-      - name: Build aicr
-        uses: ./.github/actions/aicr-build
-        with:
-          validator_phases: 'conformance'
-
-      - name: Install runtime bundle
-        id: bundle-install
-        uses: ./.github/actions/gpu-operator-install
-        with:
-          method: bundle
-          accelerator: h100
-          intent: training
-          platform: kubeflow
-
-      # --- Snapshot and GPU validation ---
-
-      - name: Snapshot and validate GPU
-        uses: ./.github/actions/gpu-snapshot-validate
-        with:
-          gpu_model: H100
-          min_gpu_count: '2'
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-
-      # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
-
-      - name: Install Karpenter + KWOK
-        uses: ./.github/actions/install-karpenter-kwok
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-
-      # --- Health checks ---
-
-      - name: Prepare chainsaw
-        id: versions
-        uses: ./.github/actions/load-versions
-
-      - name: Install chainsaw
-        uses: ./.github/actions/setup-build-tools
-        with:
-          install_chainsaw: 'true'
-          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
-
-      - name: Run chainsaw health checks
-        run: |
-          chainsaw test \
-            --test-dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
-            --config tests/chainsaw/chainsaw-config.yaml
-
-      # --- CNCF AI Conformance validation ---
-      # Runs last to ensure the DCGM → Prometheus → adapter pipeline
-      # has had time to bootstrap (pod-autoscaling check needs live metric data).
-
-      - name: Verify expected resources exist
-        run: |
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug
-
-      - name: Validate CNCF AI Conformance
-        id: validate-conformance
-        run: |
-          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
-          ./aicr validate \
-            --recipe recipe.yaml \
-            --phase conformance \
-            --namespace gpu-operator \
-            --kubeconfig="${HOME}/.kube/config" \
-            --require-gpu \
-            --image=ko.local:smoke-test \
-            --timeout=10m \
-            --toleration '*' \
-            --output=validation-result.yaml \
-            --evidence-dir=conformance-evidence
-
-      # --- Validation artifacts ---
-
-      # Collect a post-run resource snapshot regardless of whether conformance
-      # validation ran, so triage always has a cluster-state artifact.
-      - name: Collect validation artifacts
-        if: >-
-          always()
-          && !cancelled()
-          && steps.bundle-install.outcome == 'success'
-        continue-on-error: true
-        shell: bash
-        run: |
-          set -o pipefail
-          mkdir -p conformance-evidence
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug | tee conformance-evidence/resource-existence-post.txt
-
-      - name: Upload validation artifacts
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: conformance-evidence
-          path: |
-            conformance-evidence/
-            validation-result.yaml
-          if-no-files-found: warn
-
-      # --- Debug diagnostics (before cleanup so resources still exist) ---
-
-      - name: Debug diagnostics
-        if: failure()
-        run: |
-          echo "=== Grafana deployment ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true
-          echo "=== Grafana pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \
-            -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true
-          echo "=== Grafana deployment describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true
-          echo "=== Grafana pod describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \
-            -l app.kubernetes.io/name=grafana 2>/dev/null || true
-          echo "=== KAI scheduler pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true
-          echo "=== KAI scheduler logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
-            logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
-          echo "=== KAI scheduler queues ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
-          echo "=== KAI scheduler podgroups ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true
-          echo "=== Kubeflow Trainer deployment ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
-          echo "=== Kubeflow pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true
-          echo "=== Kubeflow validating webhooks ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
-          echo "=== Kubeflow Trainer CRD ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
-          echo "=== Non-running pods (all namespaces) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
-            --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-          echo "=== GPU Operator pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
-          echo "=== Node resources ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
-            grep -A 20 "Allocated resources" || true
-
-      - name: GPU Test Cleanup
-        if: always()
-        uses: ./.github/actions/gpu-test-cleanup
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          artifact_name_prefix: gpu-training-test-debug
+    uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml
+    with:
+      job_name: GPU Training Test (nvkind + H100 x2)
+      cluster_name: gpu-training-test
+      intent: training
+      platform: kubeflow
+      chainsaw_path: tests/chainsaw/ai-conformance/kind-training-kubeflow
+      artifact_name_prefix: gpu-training-test-debug
diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml
index d5b8c5c74..af8d3860c 100644
--- a/.github/workflows/gpu-smoke-test.yaml
+++ b/.github/workflows/gpu-smoke-test.yaml
@@ -20,8 +20,6 @@ on:
   push:
     branches:
       - "pull-request/[0-9]+"
-  pull_request:
-    types: [labeled]
   workflow_dispatch: {}  # Allow manual runs
 
 permissions:
@@ -40,6 +38,8 @@ jobs:
       should-run: ${{ steps.filter.outputs.matched }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
       - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d  # v4.0.1
         id: filter
         with:
@@ -50,8 +50,13 @@ jobs:
               - '.github/actions/gpu-cluster-setup/**'
               - '.github/actions/gpu-operator-install/**'
               - '.github/actions/aicr-build/**'
+              - '.github/actions/gpu-debug-diagnostics/**'
               - '.github/actions/gpu-test-cleanup/**'
+              - '.github/actions/gpu-smoke-nvidia-smi/**'
               - '.github/actions/load-versions/**'
+              - '.github/scripts/gpu-debug-diagnostics.sh'
+              - '.github/scripts/gpu-smoke-run-nvidia-smi.sh'
+              - '.github/scripts/gpu-smoke-show-nvidia-smi-log.sh'
               - 'pkg/collector/**'
               - 'pkg/snapshotter/**'
               - '.github/actions/gpu-snapshot-validate/**'
@@ -62,11 +67,13 @@ jobs:
 
   gpu-smoke-test:
     needs: [check-paths]
+    # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+    # checkout. PR GPU coverage runs through the pull-request/<number> push
+    # mirror after ok-to-test approval.
     if: >
       always() && (
         github.event_name == 'schedule' ||
         github.event_name == 'workflow_dispatch' ||
-        (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') ||
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
     name: GPU Smoke Test (nvkind + L40G)
@@ -88,6 +95,12 @@ jobs:
 
       - name: Set up GPU cluster
         uses: ./.github/actions/gpu-cluster-setup
+        with:
+          # Keep smoke runner preflight explicit so action default changes do not
+          # silently alter L40G coverage.
+          min_gpu_count: '1'
+          min_free_disk_gb: '20'
+          min_available_memory_gb: '8'
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
@@ -100,31 +113,9 @@ jobs:
           method: helm
 
       - name: Run nvidia-smi in a pod
-        run: |
-          cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f -
-          apiVersion: v1
-          kind: Pod
-          metadata:
-            name: gpu-smoke-test
-          spec:
-            restartPolicy: Never
-            containers:
-            - name: nvidia-smi
-              image: ubuntu:22.04
-              command: ["nvidia-smi"]
-              resources:
-                limits:
-                  nvidia.com/gpu: 1
-          EOF
-
-          echo "Waiting for gpu-smoke-test pod to complete..."
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \
-            --for=condition=Ready --timeout=120s || true
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \
-            --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s
-
-      - name: Show nvidia-smi output
-        run: kubectl --context="kind-${KIND_CLUSTER_NAME}" logs gpu-smoke-test
+        uses: ./.github/actions/gpu-smoke-nvidia-smi
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
 
       # --- Snapshot and validation ---
 
@@ -137,20 +128,20 @@ jobs:
 
       - name: Debug diagnostics
         if: failure()
-        run: |
-          echo "=== ClusterPolicy status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true
-          echo "=== GPU Operator pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
-          echo "=== Non-running pods (all namespaces) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-          echo "=== Recent events (gpu-operator) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-          echo "=== Node status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true
+        uses: ./.github/actions/gpu-debug-diagnostics
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          mode: smoke
+
+      - name: Mark debug artifact collection
+        id: gpu-debug-artifacts
+        if: failure() || cancelled()
+        shell: bash
+        run: echo "collect=true" >> "${GITHUB_OUTPUT}"
 
       - name: GPU Test Cleanup
         if: always()
         uses: ./.github/actions/gpu-test-cleanup
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          collect_artifacts: ${{ steps.gpu-debug-artifacts.outputs.collect == 'true' }}
diff --git a/.settings.yaml b/.settings.yaml
index 75b4559b1..5ef15198f 100644
--- a/.settings.yaml
+++ b/.settings.yaml
@@ -40,6 +40,7 @@ security_tools:
 testing_tools:
   kubectl: 'v1.35.0'
   kind: '0.31.0'
+  nvkind: '78a0a514c41c3e77ac0d935f38d971d3b4455138'
   ctlptl: '0.9.0'
   tilt: '0.37.0'
   helm: 'v4.1.1'
@@ -71,6 +72,7 @@ docs_tools:
 # Testing Configuration
 testing:
   kind_node_image: 'kindest/node:v1.32.0'
+  h100_kind_node_image: 'kindest/node:v1.35.0'
 
   # Component test harness configuration
   # Used by tools/component-test/ scripts to validate individual components
diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md
index 302684d5a..c641eee07 100644
--- a/docs/user/cli-reference.md
+++ b/docs/user/cli-reference.md
@@ -1308,6 +1308,8 @@ Unknown flags are rejected with an error to catch typos (e.g., `--best-effort`).
 
 The deploy script retries failed `helm upgrade --install` and `kubectl apply` operations with exponential backoff. By default, each operation is retried up to 5 times (6 total attempts). The backoff delay increases quadratically: 5s, 20s, 45s, 80s, 120s (capped) between retries.
 
+On slower H100 CI runners, `kube-prometheus-stack` can hit Grafana's Deployment progress deadline before a longer Helm timeout would help. The deploy script intentionally keeps the default timeout and retry budget for `kube-prometheus-stack` so subsequent upgrade attempts can succeed after image pulls and controllers settle. Kind H100 Chainsaw health checks do not require Grafana because AICR conformance metrics use Prometheus, DCGM exporter, and prometheus-adapter directly.
+
 Use `--retries 0` to disable retries (fail-fast behavior). When `--best-effort` is also set, retries are exhausted first before falling through to best-effort handling.
 
 **Pre-install manifests and CRD ordering:**
@@ -1318,7 +1320,11 @@ After `helm install`, the same manifests are re-applied as post-install to ensur
 
 **Async components:**
 
-Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness.
+Components built on operator patterns, whose custom resources reconcile asynchronously, get component-specific install behavior:
+
+- `kai-scheduler` installs without `--wait` to avoid Helm timing out on CR readiness. It uses a 30-minute per-attempt timeout and caps the retry budget at 1 retry so hook diagnostics surface quickly on cold runners.
+- `dynamo-platform` is installed with `--server-side=false` when supported, so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. `deploy.sh` only adds `--server-side=false` when Helm v4.0.5 or later is detected; with older Helm clients it logs a warning and proceeds without that mitigation. Dynamo installs use a 20-minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts.
+- `kube-prometheus-stack` keeps the default timeout and retry budget because Grafana can hit its Deployment progress deadline before a longer Helm timeout would help, especially on slower H100 CI runners under image-pull and control-plane load.
 
 ##### DRA kubelet plugin registration
 
diff --git a/kwok/scripts/install-karpenter-kwok.sh b/kwok/scripts/install-karpenter-kwok.sh
index 72b64dae1..d6a17481f 100755
--- a/kwok/scripts/install-karpenter-kwok.sh
+++ b/kwok/scripts/install-karpenter-kwok.sh
@@ -41,7 +41,9 @@ KARPENTER_VERSION="${KARPENTER_VERSION:-v1.8.0}"
 KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
 KARPENTER_NAMESPACE="${KARPENTER_NAMESPACE:-karpenter}"
 KARPENTER_CLONE_DIR="${KARPENTER_CLONE_DIR:-/tmp/karpenter}"
+KWOK_HELM_TIMEOUT="${KWOK_HELM_TIMEOUT:-300s}"
 KO_BUILD_TIMEOUT="${KO_BUILD_TIMEOUT:-900}"  # 15 minutes
+KARPENTER_HELM_TIMEOUT="${KARPENTER_HELM_TIMEOUT:-300s}"
 
 RED='\033[0;31m'
 GREEN='\033[0;32m'
@@ -68,7 +70,7 @@ install_kwok() {
     helm upgrade --install kwok-controller kwok/kwok \
         --namespace kube-system \
         --set hostNetwork=true \
-        --wait --timeout 300s
+        --wait --timeout "${KWOK_HELM_TIMEOUT}"
 
     helm upgrade --install kwok-stage-fast kwok/stage-fast \
         --namespace kube-system
@@ -98,11 +100,16 @@ build_karpenter() {
     # Redirect stderr to avoid Go compilation warnings corrupting the image reference.
     # Output format: kind.local/<name>:<content-hash>
     # Hard timeout prevents a slow/stuck compilation from consuming the entire job.
+    local ko_stderr="${KARPENTER_CLONE_DIR}/ko-build.stderr"
     CONTROLLER_IMG=$(timeout "${KO_BUILD_TIMEOUT}" \
         env KO_DOCKER_REPO=kind.local \
         KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}" \
-        ko build sigs.k8s.io/karpenter/kwok 2>/dev/null) || {
+        ko build sigs.k8s.io/karpenter/kwok 2>"${ko_stderr}") || {
         log_error "ko build failed or timed out after ${KO_BUILD_TIMEOUT}s"
+        if [[ -s "${ko_stderr}" ]]; then
+            log_error "ko build stderr:"
+            sed 's/^/  /' "${ko_stderr}" || true
+        fi
         exit 1
     }
 
@@ -187,7 +194,7 @@ deploy_karpenter() {
         --set 'controller.extraVolumeMounts[0].readOnly=true' \
         --set 'controller.env[0].name=INSTANCE_TYPES_FILE_PATH' \
         --set 'controller.env[0].value=/etc/karpenter/instance-types/instance-types.json' \
-        --wait --timeout 300s \
+        --wait --timeout "${KARPENTER_HELM_TIMEOUT}" \
         || {
             log_error "Helm install failed. Diagnostics:"
             kubectl -n "${KARPENTER_NAMESPACE}" get pods -o wide 2>/dev/null || true
@@ -213,6 +220,7 @@ main() {
     log_info "Karpenter version: ${KARPENTER_VERSION}"
     log_info "Kind cluster: ${KIND_CLUSTER_NAME}"
     log_info "Namespace: ${KARPENTER_NAMESPACE}"
+    log_info "Timeouts: kwok=${KWOK_HELM_TIMEOUT} ko-build=${KO_BUILD_TIMEOUT}s karpenter=${KARPENTER_HELM_TIMEOUT}"
 
     install_kwok
     build_karpenter
diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh
index 459b054b5..6b4af1549 100755
--- a/kwok/scripts/run-all-recipes.sh
+++ b/kwok/scripts/run-all-recipes.sh
@@ -37,6 +37,31 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $*"; }
 log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
 log_error() { echo -e "${RED}[ERROR]${NC} $*"; }
 
+retry_command() {
+    local description="$1"
+    shift
+
+    local max_attempts="${KWOK_COMMAND_RETRIES:-3}"
+    local delay="${KWOK_COMMAND_RETRY_DELAY:-5}"
+    local attempt=1
+
+    while true; do
+        if "$@"; then
+            return 0
+        fi
+
+        if ((attempt >= max_attempts)); then
+            log_error "${description} failed after ${attempt} attempt(s)"
+            return 1
+        fi
+
+        log_warn "${description} failed (attempt ${attempt}/${max_attempts}); retrying in ${delay}s..."
+        sleep "${delay}"
+        attempt=$((attempt + 1))
+        delay=$((delay * 2))
+    done
+}
+
 # Find recipes with service criteria (testable cloud configurations)
 get_recipes() {
     for overlay in "${OVERLAYS_DIR}"/*.yaml; do
@@ -68,10 +93,13 @@ ensure_cluster() {
 
     if ! kubectl get deployment -n kube-system kwok-controller &>/dev/null; then
         log_info "Installing KWOK controller..."
-        helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update
-        helm upgrade --install kwok-controller kwok/kwok \
+        retry_command "Adding KWOK Helm repository" \
+            helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update
+        retry_command "Installing KWOK controller" \
+            helm upgrade --install kwok-controller kwok/kwok \
             --namespace kube-system --set hostNetwork=true --wait
-        helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system
+        retry_command "Installing KWOK stage-fast" \
+            helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system
     fi
 
     # Patch kindnet to exclude KWOK nodes
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index 435948e8f..33dc2c6e9 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -20,6 +20,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"strings"
 	"testing"
 	"time"
@@ -493,9 +494,9 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) {
 	}
 	script := string(content)
 
-	// kai-scheduler should get a custom 20m timeout override
-	if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="20m"`) {
-		t.Error("deploy.sh missing kai-scheduler 20m timeout override")
+	// kai-scheduler should get a custom 30m timeout override
+	if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="30m"`) {
+		t.Error("deploy.sh missing kai-scheduler 30m timeout override")
 	}
 	// Other components should use the default HELM_TIMEOUT
 	if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`) {
@@ -505,15 +506,241 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) {
 	if !strings.Contains(script, `COMPONENT_MAX_RETRIES="1"`) {
 		t.Error("deploy.sh missing kai-scheduler retry override")
 	}
+	if !strings.Contains(script, `if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]`) {
+		t.Error("deploy.sh missing kai-scheduler retry cap")
+	}
 	if !strings.Contains(script, `dump_kai_scheduler_helm_diagnostics "${namespace}"`) {
 		t.Error("deploy.sh missing kai-scheduler diagnostics hook")
 	}
-	if !strings.Contains(script, `kubectl get jobs -n "${namespace}"`) {
+	if !strings.Contains(script, `kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}"`) {
 		t.Error("deploy.sh missing job diagnostics")
 	}
-	if !strings.Contains(script, `kubectl describe pods -n "${namespace}"`) {
+	if !strings.Contains(script, `kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}"`) {
 		t.Error("deploy.sh missing pod diagnostics")
 	}
+
+	rootReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "README.md"))
+	if err != nil {
+		t.Fatalf("failed to read root README: %v", err)
+	}
+	componentReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "kai-scheduler", "README.md"))
+	if err != nil {
+		t.Fatalf("failed to read component README: %v", err)
+	}
+	rootReadme := string(rootReadmeContent)
+	componentReadme := string(componentReadmeContent)
+	if !strings.Contains(rootReadme, `--timeout 30m`) {
+		t.Error("root README missing kai-scheduler 30m timeout")
+	}
+	if !strings.Contains(componentReadme, `--timeout 30m`) {
+		t.Error("component README missing kai-scheduler 30m timeout")
+	}
+	if strings.Contains(componentReadme, `--wait --timeout 30m`) {
+		t.Error("component README should document kai-scheduler without --wait")
+	}
+	if strings.Contains(componentReadme, `--wait --timeout 10m`) {
+		t.Error("component README should not use default timeout for kai-scheduler")
+	}
+}
+
+func TestGenerate_DeployScriptComponentTimeouts(t *testing.T) {
+	retryCapPattern := regexp.MustCompile(`(?m)(if \[\[ "\$\{COMPONENT_MAX_RETRIES\}" -gt \d+ \]\]|COMPONENT_MAX_RETRIES="\d+")`)
+	applyArgsExpansion := `${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"}` // expansion expected in the rendered helm command
+	tests := []struct { // one case per component; each case renders a bundle containing only that component
+		name                 string              // subtest name passed to t.Run
+		component            recipe.ComponentRef // the single component placed in the recipe under test
+		wantTimeout          string              // always required inside the component block
+		wantRetryAssignment  string              // if set, required inside the component block
+		wantRetryCap         string              // if set, required inside the component block
+		wantApplyArgs        string              // if set, required inside the component block; also enables the expansion check
+		wantComment          string              // if set, required inside the component block
+		wantSnippets         []string            // required anywhere in deploy.sh
+		wantReadmeSnippets   []string            // required in both the root and the component README
+		rejectSnippets       []string            // must be absent from the component block
+		rejectScriptSnippets []string            // must be absent from the whole deploy.sh
+		rejectReadmeSnippets []string            // must be absent from both READMEs
+		rejectRetryCap       bool                // when true, the component block must not match retryCapPattern
+	}{
+		{
+			name: "dynamo-platform",
+			component: recipe.ComponentRef{
+				Name:      "dynamo-platform",
+				Namespace: "dynamo-system",
+				Chart:     "dynamo-platform",
+				Version:   "0.9.0",
+				Type:      recipe.ComponentTypeHelm,
+				Source:    "oci://nvcr.io/nvidia/ai-dynamo",
+			},
+			wantTimeout:         `COMPONENT_HELM_TIMEOUT="20m"`,
+			wantRetryAssignment: `COMPONENT_MAX_RETRIES="3"`,
+			wantRetryCap:        `if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]`,
+			wantApplyArgs:       `COMPONENT_HELM_APPLY_ARGS=(--server-side=false)`,
+			wantSnippets: []string{
+				`helm_supports_server_side_false_install`,
+				`Require v4.0.5+ before relying on`,
+				`--request-timeout="${KUBECTL_REQUEST_TIMEOUT}"`,
+				`dynamo-platform conflict mitigation requires Helm v4.0.5+`,
+				`dump_dynamo_platform_helm_diagnostics "${component}" "${namespace}"`,
+				`deployment/dynamo-platform-dynamo-operator-controller-manager`,
+				`--previous --tail=200`,
+			},
+			wantReadmeSnippets: []string{
+				`--server-side=false`,
+				`requires Helm v4.0.5 or later`,
+				`--wait --timeout 20m`,
+			},
+			rejectScriptSnippets: []string{
+				`local prerelease`,
+				`if [[ -n "${prerelease}" ]]`,
+			},
+		},
+		{
+			name: "kube-prometheus-stack",
+			component: recipe.ComponentRef{
+				Name:      "kube-prometheus-stack",
+				Namespace: "monitoring",
+				Chart:     "kube-prometheus-stack",
+				Version:   "82.8.0",
+				Type:      recipe.ComponentTypeHelm,
+				Source:    "https://prometheus-community.github.io/helm-charts",
+			},
+			wantTimeout:    `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`,
+			wantComment:    `preserve the default retry`,
+			rejectRetryCap: true,
+		},
+		{
+			name: "ordinary component defaults",
+			component: recipe.ComponentRef{
+				Name:      "gpu-operator",
+				Namespace: "gpu-operator",
+				Chart:     "gpu-operator",
+				Version:   "v25.10.1",
+				Type:      recipe.ComponentTypeHelm,
+				Source:    "https://helm.ngc.nvidia.com/nvidia",
+			},
+			wantTimeout:   `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`,
+			wantApplyArgs: `COMPONENT_HELM_APPLY_ARGS=()`,
+			wantReadmeSnippets: []string{
+				`--wait --timeout 10m`,
+			},
+			rejectSnippets: []string{
+				`--server-side=false`,
+				`COMPONENT_MAX_RETRIES="1"`,
+				`COMPONENT_MAX_RETRIES="3"`,
+			},
+			rejectReadmeSnippets: []string{
+				`--server-side=false`,
+				`--wait --timeout 20m`,
+				`--timeout 30m`,
+			},
+			rejectRetryCap: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx := context.Background()
+			outputDir := t.TempDir()
+
+			g := &Generator{
+				RecipeResult: &recipe.RecipeResult{
+					Kind:            "RecipeResult",
+					APIVersion:      "aicr.nvidia.com/v1alpha1",
+					ComponentRefs:   []recipe.ComponentRef{tt.component},
+					DeploymentOrder: []string{tt.component.Name},
+				},
+				ComponentValues: map[string]map[string]any{
+					tt.component.Name: {},
+				},
+				Version: "v1.0.0",
+			}
+
+			_, err := g.Generate(ctx, outputDir)
+			if err != nil {
+				t.Fatalf("Generate failed: %v", err)
+			}
+
+			content, err := os.ReadFile(filepath.Join(outputDir, "deploy.sh"))
+			if err != nil {
+				t.Fatalf("failed to read deploy.sh: %v", err)
+			}
+			script := string(content)
+
+			blockStart := strings.Index(script, `Installing `+tt.component.Name) // start of this component's install section
+			if blockStart == -1 {
+				t.Fatalf("deploy.sh missing %s install block", tt.component.Name)
+			}
+			blockEnd := strings.Index(script[blockStart:], `helm upgrade --install `+tt.component.Name)
+			if blockEnd == -1 {
+				t.Fatalf("deploy.sh missing %s helm install command", tt.component.Name)
+			}
+			componentBlock := script[blockStart : blockStart+blockEnd] // from the "Installing <name>" marker up to, but excluding, the helm command
+
+			if !strings.Contains(componentBlock, tt.wantTimeout) {
+				t.Errorf("deploy.sh missing %s timeout override %q", tt.component.Name, tt.wantTimeout)
+			}
+			if tt.wantRetryAssignment != "" && !strings.Contains(componentBlock, tt.wantRetryAssignment) {
+				t.Errorf("deploy.sh missing %s retry override %q", tt.component.Name, tt.wantRetryAssignment)
+			}
+			if tt.wantRetryCap != "" && !strings.Contains(componentBlock, tt.wantRetryCap) {
+				t.Errorf("deploy.sh missing %s retry cap %q", tt.component.Name, tt.wantRetryCap)
+			}
+			if tt.wantApplyArgs != "" && !strings.Contains(componentBlock, tt.wantApplyArgs) {
+				t.Errorf("deploy.sh missing %s apply args %q", tt.component.Name, tt.wantApplyArgs)
+			}
+			if tt.wantApplyArgs != "" && !strings.Contains(script[blockStart:], applyArgsExpansion) { // searched past componentBlock because the helm command itself is excluded from it
+				t.Errorf("deploy.sh missing %s apply args in helm command", tt.component.Name)
+			}
+			if tt.wantComment != "" && !strings.Contains(componentBlock, tt.wantComment) {
+				t.Errorf("deploy.sh missing %s retry rationale", tt.component.Name)
+			}
+			for _, snippet := range tt.wantSnippets {
+				if !strings.Contains(script, snippet) {
+					t.Errorf("deploy.sh missing %s snippet %q", tt.component.Name, snippet)
+				}
+			}
+			for _, snippet := range tt.rejectSnippets {
+				if strings.Contains(componentBlock, snippet) {
+					t.Errorf("deploy.sh should not include %s snippet %q", tt.component.Name, snippet)
+				}
+			}
+			for _, snippet := range tt.rejectScriptSnippets {
+				if strings.Contains(script, snippet) {
+					t.Errorf("deploy.sh should not include %s script snippet %q", tt.component.Name, snippet)
+				}
+			}
+			if tt.rejectRetryCap && retryCapPattern.MatchString(componentBlock) { // any numeric retry cap or literal retry assignment counts as a violation
+				t.Errorf("deploy.sh should not cap %s retries in its component block", tt.component.Name)
+			}
+
+			rootReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "README.md"))
+			if err != nil {
+				t.Fatalf("failed to read root README: %v", err)
+			}
+			componentReadmeContent, err := os.ReadFile(filepath.Join(outputDir, tt.component.Name, "README.md"))
+			if err != nil {
+				t.Fatalf("failed to read component README: %v", err)
+			}
+			rootReadme := string(rootReadmeContent)
+			componentReadme := string(componentReadmeContent)
+			for _, snippet := range tt.wantReadmeSnippets {
+				if !strings.Contains(rootReadme, snippet) {
+					t.Errorf("root README missing %s snippet %q", tt.component.Name, snippet)
+				}
+				if !strings.Contains(componentReadme, snippet) {
+					t.Errorf("component README missing %s snippet %q", tt.component.Name, snippet)
+				}
+			}
+			for _, snippet := range tt.rejectReadmeSnippets {
+				if strings.Contains(rootReadme, snippet) {
+					t.Errorf("root README should not include %s snippet %q", tt.component.Name, snippet)
+				}
+				if strings.Contains(componentReadme, snippet) {
+					t.Errorf("component README should not include %s snippet %q", tt.component.Name, snippet)
+				}
+			}
+		})
+	}
 }
 
 func TestGenerate_UndeployScriptExecutable(t *testing.T) {
diff --git a/pkg/bundler/deployer/helm/templates/README.md.tmpl b/pkg/bundler/deployer/helm/templates/README.md.tmpl
index 3c3e874f4..1ef2f252b 100644
--- a/pkg/bundler/deployer/helm/templates/README.md.tmpl
+++ b/pkg/bundler/deployer/helm/templates/README.md.tmpl
@@ -77,21 +77,36 @@ kustomize build '{{ .Repository }}//{{ .Path }}{{ if .Tag }}?ref={{ .Tag }}{{ en
 ```bash
 {{ if .IsOCI -}}
 helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --version {{ .Version }} \
   -n {{ .Namespace }} --create-namespace \
   -f {{ .Name }}/values.yaml \
   -f {{ .Name }}/cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+{{/* else/end sit at column 0 so the closing code fence is not indented */}}{{ else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+{{ end -}}
 {{ else -}}
 helm upgrade --install {{ .Name }} {{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --repo {{ .Repository }} \
   --version {{ .Version }} \
   -n {{ .Namespace }} --create-namespace \
   -f {{ .Name }}/values.yaml \
   -f {{ .Name }}/cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+{{/* else/end sit at column 0 so the closing code fence is not indented */}}{{ else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+{{ end -}}
 {{ end -}}
 ```
+{{ if eq .Name "dynamo-platform" }}
+`--server-side=false` requires Helm v4.0.5 or later. `deploy.sh` checks the Helm version before adding the flag and warns before falling back when unavailable.
+{{ end -}}
 {{ end -}}
 {{ if .HasManifests }}
 ```bash
@@ -119,7 +134,9 @@ Each Helm component has two values files in its directory:
 
 ## Upgrade
 
-To upgrade a specific Helm component:
+To upgrade a specific Helm component, use the generic form below. Some
+components require component-specific flags; use the component subdirectory
+`README.md` for the exact command.
 
 ```bash
 helm upgrade <component> <chart> --version <version> -n <namespace> -f <component>/values.yaml -f <component>/cluster-values.yaml --wait --timeout 10m
diff --git a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
index 068bfcd28..66762ac7f 100644
--- a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
+++ b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl
@@ -43,21 +43,36 @@ Namespace: {{ .Namespace }}
 ```bash
 {{ if .IsOCI -}}
 helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --version {{ .Version }} \
   -n {{ .Namespace }} --create-namespace \
   -f values.yaml \
   -f cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+{{/* else/end sit at column 0 so the closing code fence is not indented */}}{{ else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+{{ end -}}
 {{ else -}}
 helm upgrade --install {{ .Name }} {{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --repo {{ .Repository }} \
   --version {{ .Version }} \
   -n {{ .Namespace }} --create-namespace \
   -f values.yaml \
   -f cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+{{/* else/end sit at column 0 so the closing code fence is not indented */}}{{ else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+{{ end -}}
 {{ end -}}
 ```
+{{ if eq .Name "dynamo-platform" }}
+`--server-side=false` requires Helm v4.0.5 or later. `deploy.sh` checks the Helm version before adding the flag and warns before falling back when unavailable.
+{{ end -}}
 {{ if .HasManifests }}
 After the chart is installed, apply additional manifests:
 
@@ -70,19 +85,31 @@ kubectl apply -f manifests/
 ```bash
 {{ if .IsOCI -}}
 helm upgrade {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --version {{ .Version }} \
   -n {{ .Namespace }} \
   -f values.yaml \
   -f cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+{{/* else/end sit at column 0 so the closing code fence is not indented */}}{{ else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+{{ end -}}
 {{ else -}}
 helm upgrade {{ .Name }} {{ .ChartName }} \
+  {{ if eq .Name "dynamo-platform" }}--server-side=false \
+  {{ end -}}
   --repo {{ .Repository }} \
   --version {{ .Version }} \
   -n {{ .Namespace }} \
   -f values.yaml \
   -f cluster-values.yaml \
-  --wait --timeout 10m
+  {{ if eq .Name "kai-scheduler" -}}
+  --timeout 30m
+{{/* else/end sit at column 0 so the closing code fence is not indented */}}{{ else -}}
+  --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }}
+{{ end -}}
 {{ end -}}
 ```
 
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index 0f83eb71c..6b946a969 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -24,6 +24,7 @@ trap 'rm -rf "${HELM_WORKDIR}"' EXIT
 cd "${HELM_WORKDIR}"
 
 HELM_TIMEOUT="10m"
+KUBECTL_REQUEST_TIMEOUT="10s"
 NO_WAIT=false
 BEST_EFFORT=false
 FAILED_COMPONENTS=""
@@ -59,6 +60,34 @@ function backoff_seconds() {
   echo "${seconds}"
 }
 
+# Returns 0 only when the local Helm client reports v4.0.5 or newer and its
+# `upgrade` help mentions --server-side; returns 1 otherwise (incl. unparsable).
+function helm_supports_server_side_false_install() {
+  local version major minor patch
+
+  # Helm v4.0.0-v4.0.4 advertise --server-side=false but ignore it for the
+  # upgrade --install install-fallback path. Require v4.0.5+ before relying on
+  # the flag for Dynamo's webhook Secret conflict mitigation.
+  version="$(helm version --short 2>/dev/null | head -n 1 || true)"
+  version="${version#v}"
+  version="${version%%+*}" # strip any "+..." suffix before parsing
+  if ! [[ "${version}" =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)(-[0-9A-Za-z.-]+)?$ ]]; then
+    return 1
+  fi
+  major="${BASH_REMATCH[1]}"
+  minor="${BASH_REMATCH[2]}"
+  patch="${BASH_REMATCH[3]}"
+  if (( major < 4 )); then
+    return 1
+  fi
+  if (( major == 4 )) && (( minor == 0 )) && (( patch < 5 )); then
+    return 1
+  fi
+  # Match on captured help text rather than piping into `grep -q`: with
+  # pipefail, grep exiting early can SIGPIPE helm and turn a match into failure.
+  [[ "$(helm help upgrade 2>/dev/null)" == *--server-side* ]]
+}
+
 function retry() {
   local desc="$1"; shift
   local attempt=0
@@ -86,7 +115,7 @@ function retry() {
 function cleanup_helm_hooks() {
   local namespace="$1"
   local job_names
-  job_names=$(kubectl get jobs -n "${namespace}" \
+  job_names=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" \
     --field-selector=status.successful=0 \
     -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \
     2>/dev/null || true)
@@ -97,7 +126,7 @@ function cleanup_helm_hooks() {
     [[ -z "${name}" ]] && continue
     # Get the full Job JSON to reliably check annotations and status
     local job_json
-    job_json=$(kubectl get job "${name}" -n "${namespace}" -o json 2>/dev/null || true)
+    job_json=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get job "${name}" -n "${namespace}" -o json 2>/dev/null || true)
     [[ -z "${job_json}" ]] && continue
     # Skip non-hook Jobs (no helm.sh/hook annotation)
     local hook_val
@@ -106,13 +135,13 @@ function cleanup_helm_hooks() {
     # Capture diagnostics before deleting. This helps diagnose transient hook
     # failures (e.g., dynamo ssh-keygen) that are otherwise lost after cleanup.
     echo "  --- Failed hook Job ${name} diagnostics ---"
-    kubectl describe job "${name}" -n "${namespace}" 2>/dev/null | tail -50 || true
+    kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe job "${name}" -n "${namespace}" 2>/dev/null | tail -50 || true
     local pod_names
-    pod_names=$(kubectl get pods -n "${namespace}" -l "job-name=${name}" \
+    pod_names=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -l "job-name=${name}" \
       -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)
     for pod_name in ${pod_names}; do
       echo "  --- Hook pod ${pod_name} describe ---"
-      kubectl describe pod "${pod_name}" -n "${namespace}" 2>/dev/null | tail -50 || true
+      kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pod "${pod_name}" -n "${namespace}" 2>/dev/null | tail -50 || true
     done
     echo "  --- End diagnostics for ${name} ---"
     # Delete any non-succeeded hook Job. This function only runs after a Helm
@@ -120,7 +149,7 @@ function cleanup_helm_hooks() {
     # retry — whether it failed, is stuck Pending (timed out before the pod
     # started), or is still active with a stuck container.
     echo "  Cleaning up stale Helm hook Job ${name} in ${namespace}..."
-    kubectl delete job "${name}" -n "${namespace}" --ignore-not-found 2>/dev/null || true
+    kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" delete job "${name}" -n "${namespace}" --ignore-not-found 2>/dev/null || true
   done <<< "${job_names}"
 }
 
@@ -132,28 +161,58 @@ function dump_kai_scheduler_helm_diagnostics() {
 
   echo "  --- ${namespace} diagnostics ---"
   echo "  Jobs:"
-  kubectl get jobs -n "${namespace}" 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" 2>/dev/null || true
   echo "  Job descriptions:"
-  kubectl describe jobs -n "${namespace}" 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe jobs -n "${namespace}" 2>/dev/null || true
   echo "  Pods:"
-  kubectl get pods -n "${namespace}" -o wide 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -o wide 2>/dev/null || true
   echo "  Pod descriptions:"
-  kubectl describe pods -n "${namespace}" 2>/dev/null || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}" 2>/dev/null || true
   echo "  Recent events:"
-  kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+  echo "  --- End ${namespace} diagnostics ---"
+}
+
+function dump_dynamo_platform_helm_diagnostics() { # args: $1 = component name, $2 = namespace to inspect
+  local component="$1"
+  local namespace="$2"
+  if [[ "${component}" != "dynamo-platform" ]]; then # no-op for every other component
+    return
+  fi
+
+  echo "  --- ${namespace} diagnostics ---"
+  echo "  Deployments:"
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get deployments -n "${namespace}" -o wide 2>/dev/null || true # best-effort: stderr is dropped and failures are ignored, here and below
+  echo "  Jobs:"
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" 2>/dev/null || true
+  echo "  Pods:"
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -o wide 2>/dev/null || true
+  echo "  Pod descriptions:"
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}" 2>/dev/null || true
+  echo "  Dynamo operator manager logs:"
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --tail=200 2>/dev/null || true
+  echo "  Dynamo operator manager previous logs:" # same container, but requested with --previous
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --previous --tail=200 2>/dev/null || true
+  echo "  Grove operator logs:"
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/grove-operator -n "${namespace}" --all-containers --tail=100 2>/dev/null || true
+  echo "  Grove operator previous logs:"
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/grove-operator -n "${namespace}" --all-containers --previous --tail=100 2>/dev/null || true
+  echo "  Recent events:" # sorted by lastTimestamp, only the last 50 output lines are kept
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
   echo "  --- End ${namespace} diagnostics ---"
 }
 
 # helm_retry contract:
-#   helm_retry "<description>" "<namespace>" "<max_retries>" <command> [args...]
-# Callers must pass the retry budget as the third positional argument before the
-# command to execute. This keeps per-component retry tuning explicit at the
-# callsite instead of relying on the global MAX_RETRIES fallback.
+#   helm_retry "<description>" "<component>" "<namespace>" "<max_retries>" <command> [args...]
+# Callers must pass the component name and retry budget before the command to
+# execute. This keeps per-component retry tuning and diagnostics explicit at the
+# callsite instead of relying on global fallbacks.
 function helm_retry() {
   local desc="$1"
-  local namespace="$2"
-  local max_retries="$3"
-  shift 3
+  local component="$2"
+  local namespace="$3"
+  local max_retries="$4"
+  shift 4
   local attempt=0
   while true; do
     if "$@"; then
@@ -161,6 +220,7 @@ function helm_retry() {
     fi
     attempt=$((attempt + 1))
     dump_kai_scheduler_helm_diagnostics "${namespace}"
+    dump_dynamo_platform_helm_diagnostics "${component}" "${namespace}"
     if [[ ${attempt} -gt ${max_retries} ]]; then
       echo "ERROR: ${desc} failed after ${attempt} attempts"
       return 1
@@ -371,13 +431,35 @@ retry "{{ .Name }} pre-install manifests" apply_ignoring_crd_race "${SCRIPT_DIR}
   || helm_failed "{{ .Name }}"
 {{ end -}}
 # Per-component timeout override. Most components use HELM_TIMEOUT (10m).
-# Components with slow hooks (e.g., kai-scheduler crd-upgrader image pull
-# on cold runners) get a longer timeout to avoid unnecessary retry cycles.
+# Components with slow hooks on cold runners get a longer timeout to avoid
+# unnecessary retry cycles.
 COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"
 COMPONENT_MAX_RETRIES="${MAX_RETRIES}"
+COMPONENT_HELM_APPLY_ARGS=()
 {{ if eq .Name "kai-scheduler" -}}
+COMPONENT_HELM_TIMEOUT="30m"
+if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then
+  COMPONENT_MAX_RETRIES="1"
+fi
+{{ else if eq .Name "dynamo-platform" -}}
 COMPONENT_HELM_TIMEOUT="20m"
-COMPONENT_MAX_RETRIES="1"
+# Grove owns the generated webhook certificate Secret data after install.
+# Client-side apply avoids server-side field ownership conflicts during retries.
+# This flag requires Helm v4.0.5+; earlier Helm v4 releases advertise the flag
+# but ignore --server-side=false on a fresh upgrade --install fallback.
+if helm_supports_server_side_false_install; then
+  COMPONENT_HELM_APPLY_ARGS=(--server-side=false)
+else
+  echo "::warning::dynamo-platform conflict mitigation requires Helm v4.0.5+ with working --server-side=false install fallback; proceeding without this flag"
+fi
+if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then
+  COMPONENT_MAX_RETRIES="3"
+fi
+{{ else if eq .Name "kube-prometheus-stack" -}}
+# Grafana can trip its Deployment progress deadline before a longer Helm
+# timeout helps, especially on slower H100 CI runners under image-pull and
+# control-plane load. Keep the default 10m timeout and preserve the default retry
+# budget so later upgrades can succeed after images and controllers settle.
 {{ end -}}
 # Derive wait args: global --wait/--no-wait behavior + component timeout.
 if [[ "${NO_WAIT}" == "true" ]]; then
@@ -391,9 +473,11 @@ if echo "${ASYNC_COMPONENTS}" | grep -qw "{{ .Name }}"; then
   echo "  (async component — skipping --wait, keeping --timeout for hooks)"
 fi
 {{ if .IsOCI -}}
-helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
+helm_retry "{{ .Name }} helm install" "{{ .Name }}" \
+  "{{ .Namespace }}" \
   "${COMPONENT_MAX_RETRIES}" \
   helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
+  ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \
   {{ if .Version }}--version {{ .Version }} \
   {{ end -}}
   -n {{ .Namespace }} --create-namespace \
@@ -402,9 +486,11 @@ helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
   ${COMPONENT_WAIT_ARGS} \
   || helm_failed "{{ .Name }}"
 {{ else -}}
-helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \
+helm_retry "{{ .Name }} helm install" "{{ .Name }}" \
+  "{{ .Namespace }}" \
   "${COMPONENT_MAX_RETRIES}" \
   helm upgrade --install {{ .Name }} {{ .ChartName }} \
+  ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \
   --repo {{ .Repository }} \
   {{ if .Version }}--version {{ .Version }} \
   {{ end -}}
diff --git a/recipes/overlays/kind.yaml b/recipes/overlays/kind.yaml
index b0d8dbd76..791016135 100644
--- a/recipes/overlays/kind.yaml
+++ b/recipes/overlays/kind.yaml
@@ -115,6 +115,11 @@ spec:
     - name: kube-prometheus-stack
       type: Helm
       overrides:
+        # CI only needs component health, not the full upstream alerting rule
+        # set. Skipping default rules reduces PrometheusRule churn during
+        # install on small kind control planes.
+        defaultRules:
+          create: false
         prometheus:
           prometheusSpec:
             # Smaller storage for local testing
@@ -132,7 +137,35 @@ spec:
                 memory: 1Gi
             # Shorter retention for local testing
             retention: 7d
+        prometheusOperator:
+          # Keep operator-owned monitoring custom resources in the monitoring
+          # namespace for kind. Do not scope ServiceMonitor discovery here;
+          # GPU, Kubeflow, and Dynamo monitors may live in their own namespaces.
+          alertmanagerInstanceNamespaces:
+            - monitoring
+          alertmanagerConfigNamespaces:
+            - monitoring
+          prometheusInstanceNamespaces:
+            - monitoring
+          thanosRulerInstanceNamespaces:
+            - monitoring
+          # CI kind control planes can be slow under image pulls and controller
+          # startup. Avoid restarting the operator on short health probe stalls.
+          livenessProbe:
+            timeoutSeconds: 10
+            failureThreshold: 10
+          readinessProbe:
+            timeoutSeconds: 10
+            failureThreshold: 6
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
         grafana:
+          enabled: false
           resources:
             requests:
               cpu: 100m
diff --git a/tests/chainsaw/ai-conformance/README.md b/tests/chainsaw/ai-conformance/README.md
index b1a88e9d4..a69b88f13 100644
--- a/tests/chainsaw/ai-conformance/README.md
+++ b/tests/chainsaw/ai-conformance/README.md
@@ -73,10 +73,11 @@ tests/chainsaw/ai-conformance/
 │   ├── assert-cert-manager.yaml         # cert-manager healthy
 │   ├── assert-dra-driver.yaml           # DRA driver healthy
 │   ├── assert-kai-scheduler.yaml        # KAI scheduler healthy
-│   ├── assert-monitoring.yaml           # Prometheus stack healthy
+│   ├── assert-monitoring.yaml           # Prometheus stack healthy with Grafana
 │   └── assert-skyhook.yaml              # Skyhook operator healthy
 ├── kind-common/                         # Shared Kind-only assertions
 │   ├── assert-gpu-operator.yaml         # GPU operator healthy on kind
+│   ├── assert-monitoring.yaml           # Prometheus stack healthy without Grafana
 │   ├── assert-network-operator.yaml     # Network operator healthy on kind
 │   └── assert-nvsentinel.yaml           # NVSentinel healthy on kind
 ├── kind-inference-dynamo/               # Kind + H100 + inference + dynamo leaf suite
diff --git a/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml
new file mode 100644
index 000000000..868be3fea
--- /dev/null
+++ b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml
@@ -0,0 +1,85 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Assert the kind monitoring stack components required by H100 CI are healthy.
+# Grafana is not asserted here: the kind overlay disables it, and conformance
+# metrics use Prometheus, DCGM exporter, and prometheus-adapter directly.
+
+# Prometheus Operator - manages Prometheus, Alertmanager, and ServiceMonitor CRs
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-prometheus-operator
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
+---
+# kube-state-metrics - Kubernetes object state metrics
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-state-metrics
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
+---
+# Prometheus StatefulSet - time series database
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: prometheus-kube-prometheus-prometheus
+  namespace: monitoring
+status:
+  (readyReplicas > `0`): true
+---
+# Alertmanager StatefulSet - alert routing and silencing
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: alertmanager-kube-prometheus-alertmanager
+  namespace: monitoring
+status:
+  (readyReplicas > `0`): true
+---
+# Prometheus Node Exporter DaemonSet - node-level hardware/OS metrics
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: prometheus-node-exporter
+  namespace: monitoring
+status:
+  (numberReady > `0`): true
+  (desiredNumberScheduled > `0`): true
+---
+# k8s-ephemeral-storage-metrics - ephemeral storage usage metrics
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: k8s-ephemeral-storage-metrics
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
+---
+# Prometheus Adapter - custom metrics API for HPA
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus-adapter
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
index 1b1f701ad..cac236b32 100644
--- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
+++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
@@ -65,10 +65,10 @@ spec:
 
     # ── Monitoring ─────────────────────────────────────────────────────
     - name: assert-monitoring
-      description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter.
+      description: Verify kind monitoring stack without Grafana.
       try:
         - assert:
-            file: ../common/assert-monitoring.yaml
+            file: ../kind-common/assert-monitoring.yaml
 
     # ── kgateway ───────────────────────────────────────────────────────
     - name: assert-kgateway
@@ -110,6 +110,8 @@ spec:
     # ── KAI Scheduler ──────────────────────────────────────────────────
     - name: assert-kai-scheduler
       description: Verify KAI scheduler is available.
+      timeouts:
+        assert: 600s
       try:
         - assert:
             file: ../common/assert-kai-scheduler.yaml
diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
index 382d99104..20332ad64 100644
--- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
+++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
@@ -60,10 +60,10 @@ spec:
             file: ../kind-common/assert-gpu-operator.yaml
 
     - name: assert-monitoring
-      description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter.
+      description: Verify kind monitoring stack without Grafana.
       try:
         - assert:
-            file: ../common/assert-monitoring.yaml
+            file: ../kind-common/assert-monitoring.yaml
 
     - name: assert-skyhook
       description: Verify Skyhook operator controller-manager is available.
@@ -73,6 +73,8 @@ spec:
 
     - name: assert-kubeflow-trainer
       description: Verify Kubeflow Trainer controller, validating webhook, and TrainJob CRD are available.
+      timeouts:
+        assert: 600s
       try:
         - assert:
             file: assert-kubeflow-trainer.yaml
@@ -99,6 +101,8 @@ spec:
 
     - name: assert-kai-scheduler
       description: Verify KAI scheduler is available.
+      timeouts:
+        assert: 600s
       try:
         - assert:
             file: ../common/assert-kai-scheduler.yaml