diff --git a/.github/actions/README.md b/.github/actions/README.md index cef2fd6ca..15710df7d 100644 --- a/.github/actions/README.md +++ b/.github/actions/README.md @@ -4,6 +4,13 @@ This directory contains a modular, reusable GitHub Actions architecture optimize ## Composite Actions +### Script Conventions + +Composite action helper scripts in this directory are intentionally portable +across checkout modes: keep them mode `0644` and invoke them as +`bash path/to/script.sh` from workflows or `action.yml` files. Do not rely on +executable bits or `./script.sh` invocation. + ### Core CI/CD Actions #### `security-scan/` @@ -50,7 +57,8 @@ This action runs `tools/setup-tools --skip-go --skip-docker` in auto mode, which **When to use**: When you need version values in workflow steps **Outputs**: - `go`, `goreleaser`, `ko`, `crane`, `golangci_lint`, `yamllint`, `addlicense` -- `grype`, `kubectl`, `kind`, `ctlptl`, `tilt`, `helm` +- `grype`, `kubectl`, `kind`, `nvkind`, `ctlptl`, `tilt`, `helm` +- `kind_node_image`, `h100_kind_node_image` **Example**: ```yaml diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml index 7a973ae21..671392215 100644 --- a/.github/actions/aicr-build/action.yml +++ b/.github/actions/aicr-build/action.yml @@ -13,9 +13,17 @@ # limitations under the License. name: 'AICR Build' -description: 'Builds the aicr validator image (via Dockerfile) and CLI binary, and loads the image into kind.' +description: 'Builds the aicr CLI and optional snapshot/validator images, and loads requested images into kind.' inputs: + build_cli: + description: 'Build and stage the aicr CLI binary at the repository root' + required: false + default: 'true' + build_snapshot_agent: + description: 'Build the CUDA-based snapshot agent image and load it into kind' + required: false + default: 'true' build_validators: description: 'Deprecated: use validator_phases instead. Ignored when validator_phases is set.' 
required: false @@ -28,86 +36,27 @@ inputs: runs: using: 'composite' steps: - - - name: Install ko - shell: bash - run: | - KO_VERSION=$(yq eval '.build_tools.ko' .settings.yaml) - GOFLAGS= go install "github.com/google/ko@${KO_VERSION}" - - - name: Build snapshot agent image and load into kind + - name: Build aicr CLI binary + if: inputs.build_cli == 'true' || inputs.build_snapshot_agent == 'true' shell: bash env: GOFLAGS: -mod=vendor - run: | - # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection). - # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed. - # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot. - CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr - docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE' - FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04 - COPY dist/aicr /usr/local/bin/aicr - ENTRYPOINT ["/usr/local/bin/aicr"] - DOCKERFILE + run: bash "${{ github.action_path }}/build-cli.sh" - # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but - # does not set a node selector, so it can land on any GPU-capable node - # including the control-plane (e.g., T4 smoke test). - # - # Timeout is intentionally generous (900s per attempt). H100 self-hosted - # runners transfer images over a shared Docker-in-Docker bridge; large - # CUDA base images (~250MB compressed) combined with I/O contention from - # parallel GPU operator pods regularly exceed the previous 600s limit. - timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || { - echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..." 
- timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" - } + - name: Build snapshot agent image and load into kind + if: inputs.build_snapshot_agent == 'true' + shell: bash + run: bash "${{ github.action_path }}/build-snapshot-agent.sh" - name: Build validator images and load into kind if: "!(inputs.validator_phases == 'none' || (inputs.validator_phases == '' && inputs.build_validators == 'false'))" shell: bash env: GOFLAGS: -mod=vendor - run: | - # Determine which validator phases to build. - # validator_phases takes precedence; build_validators is a deprecated fallback. - if [[ -n "${{ inputs.validator_phases }}" ]]; then - if [[ "${{ inputs.validator_phases }}" == "none" ]]; then - echo "Skipping validator builds (validator_phases=none)" - exit 0 - fi - PHASES="${{ inputs.validator_phases }}" - else - # Default: build all phases (backwards compatible) - PHASES="deployment,performance,conformance" - fi - - # Compile only the requested validator binaries. - mkdir -p dist/validator - for phase in ${PHASES//,/ }; do - echo "Building validator binary: ${phase}" - CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}" - done - - for phase in ${PHASES//,/ }; do - mkdir -p "validators/${phase}/testdata" - docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <&2 + exit 1 + ;; + esac +} + +MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS#"${MAX_RECOVERY_ATTEMPTS%%[![:space:]]*}"}" +MAX_RECOVERY_ATTEMPTS="${MAX_RECOVERY_ATTEMPTS%"${MAX_RECOVERY_ATTEMPTS##*[![:space:]]}"}" +if ! 
[[ "${MAX_RECOVERY_ATTEMPTS}" =~ ^[0-9]+$ ]]; then + echo "::error::max_recovery_attempts must be a non-negative integer, got '${MAX_RECOVERY_ATTEMPTS}'" + exit 1 +fi + +MAX_RESTARTS="${MAX_RESTARTS:-}" +MAX_RESTARTS="${MAX_RESTARTS#"${MAX_RESTARTS%%[![:space:]]*}"}" +MAX_RESTARTS="${MAX_RESTARTS%"${MAX_RESTARTS##*[![:space:]]}"}" +if [[ -n "${MAX_RESTARTS}" ]] && [[ "${MAX_RESTARTS}" != "1" ]]; then + echo "::warning::max_restarts is deprecated and ignored; use stability_window to fail on new control-plane restarts" +fi + +WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}" +WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}" +validate_duration_input wait_timeout "${WAIT_TIMEOUT}" + +STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}" +STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}" +if [[ -z "${STABILITY_WINDOW}" ]]; then + STABILITY_WINDOW="0s" +fi +validate_duration_input stability_window "${STABILITY_WINDOW}" +if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then + STABILITY_WINDOW="0s" +fi +STABILITY_WINDOW_SECONDS="$(duration_seconds "${STABILITY_WINDOW}")" + +STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL:-10s}" +STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL#"${STABILITY_PROBE_INTERVAL%%[![:space:]]*}"}" +STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL%"${STABILITY_PROBE_INTERVAL##*[![:space:]]}"}" +validate_duration_input stability_probe_interval "${STABILITY_PROBE_INTERVAL}" +STABILITY_PROBE_INTERVAL_SECONDS="$(duration_seconds "${STABILITY_PROBE_INTERVAL}")" +if (( STABILITY_PROBE_INTERVAL_SECONDS <= 0 )); then + echo "::error::stability_probe_interval must be greater than 0, got '${STABILITY_PROBE_INTERVAL}'" + exit 1 +fi +STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD:-2}" +STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD#"${STABILITY_PROBE_FAILURE_THRESHOLD%%[![:space:]]*}"}" 
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD%"${STABILITY_PROBE_FAILURE_THRESHOLD##*[![:space:]]}"}" +if ! [[ "${STABILITY_PROBE_FAILURE_THRESHOLD}" =~ ^[0-9]+$ ]]; then + echo "::error::stability_probe_failure_threshold must be a positive integer, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'" + exit 1 +fi +if (( STABILITY_PROBE_FAILURE_THRESHOLD <= 0 )); then + echo "::error::stability_probe_failure_threshold must be greater than 0, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'" + exit 1 +fi + +LEASE_COMPONENTS="${LEASE_COMPONENTS:-kube-controller-manager kube-scheduler}" +LEASE_COMPONENTS="${LEASE_COMPONENTS#"${LEASE_COMPONENTS%%[![:space:]]*}"}" +LEASE_COMPONENTS="${LEASE_COMPONENTS%"${LEASE_COMPONENTS##*[![:space:]]}"}" + +LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT:-120s}" +LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT#"${LEASE_STALE_TIMEOUT%%[![:space:]]*}"}" +LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT%"${LEASE_STALE_TIMEOUT##*[![:space:]]}"}" +validate_duration_input lease_stale_timeout "${LEASE_STALE_TIMEOUT}" +LEASE_STALE_TIMEOUT_SECONDS="$(duration_seconds "${LEASE_STALE_TIMEOUT}")" +if (( LEASE_STALE_TIMEOUT_SECONDS <= 0 )); then + echo "::error::lease_stale_timeout must be greater than 0, got '${LEASE_STALE_TIMEOUT}'" + exit 1 +fi + +RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY#"${RECOVER_UNHEALTHY%%[![:space:]]*}"}" +RECOVER_UNHEALTHY="${RECOVER_UNHEALTHY%"${RECOVER_UNHEALTHY##*[![:space:]]}"}" +case "${RECOVER_UNHEALTHY}" in + true|false) ;; + *) + echo "::error::recover_unhealthy must be true or false, got '${RECOVER_UNHEALTHY}'" + exit 1 + ;; +esac + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +docker_timeout() { + timeout 30s docker "$@" +} + +STATIC_POD_RECREATE_SETTLE_SECONDS=5 +RESTART_COUNT_ATTEMPTS=3 +RESTART_COUNT_RETRY_SLEEP_SECONDS=2 +declare -A RECOVERY_ATTEMPTS=() +declare -A INITIAL_RESTARTS=() + +kubectl_kind get --raw='/readyz' || true + +wait_ready() { + 
local component="$1" + local selector="component=${component}" + + if ! timeout "${WAIT_TIMEOUT}" kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then + return 1 + fi +} + +restart_total() { + local component="$1" + local selector="component=${component}" + local restart_counts + local restart_count + local total=0 + local attempt + + for ((attempt = 1; attempt <= RESTART_COUNT_ATTEMPTS; attempt++)); do + if restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \ + -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then + if [[ -n "${restart_counts}" ]]; then + break + fi + echo "::warning::no container statuses found for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2 + else + echo "::warning::failed to read restart counts for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2 + fi + + if (( attempt < RESTART_COUNT_ATTEMPTS )); then + sleep "${RESTART_COUNT_RETRY_SLEEP_SECONDS}" + fi + done + + if [[ -z "${restart_counts}" ]]; then + echo "::error::no container statuses found for ${component} pods after ${RESTART_COUNT_ATTEMPTS} attempts" >&2 + dump_component_diagnostics "${component}" >&2 + exit 1 + fi + + while IFS= read -r restart_count; do + [[ -z "${restart_count}" ]] && continue + total=$((total + restart_count)) + done <<< "${restart_counts}" + echo "${total}" +} + +report_restart_baseline() { + local component="$1" + local restart_count="$2" + + if (( restart_count > 0 )); then + echo "::warning::${component} has historical restartCount=${restart_count}; checking current readiness and stability window only" + return + fi + echo "${component} restartCount=${restart_count}" +} + +dump_control_plane_summary() { + echo "=== Control-plane pod restart summary ===" + kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane -o wide || 
true + kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane \ + -o jsonpath='{range .items[*]}{.metadata.name}{" restartCount="}{range .status.containerStatuses[*]}{.restartCount}{" "}{end}{"\n"}{end}' || true +} + +require_readyz() { + local reason="$1" + + if ! kubectl_kind get --raw='/readyz'; then + echo "::error::kube-apiserver /readyz failed ${reason}" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi +} + +probe_control_plane_api() { + local reason="$1" + local component + local lease_summary + + if ! kubectl_kind get --raw='/readyz' >/dev/null; then + echo "::error::kube-apiserver /readyz probe failed ${reason}" + return 1 + fi + + for component in ${LEASE_COMPONENTS}; do + if ! lease_summary=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" \ + -o jsonpath='{.metadata.name}{" holder="}{.spec.holderIdentity}{" renewTime="}{.spec.renewTime}{"\n"}' 2>/dev/null); then + echo "::error::failed to read leader election lease ${component} ${reason}" + return 1 + fi + echo "${lease_summary}" + done +} + +lease_renew_epoch() { + local renew_time="$1" + + date -u -d "${renew_time}" +%s 2>/dev/null +} + +verify_leader_lease_freshness() { + local component + local now_epoch + local renew_time + local renew_epoch + local lease_age + + [[ -z "${LEASE_COMPONENTS}" ]] && return + + now_epoch="$(date -u +%s)" + echo "Checking leader election lease freshness (max age ${LEASE_STALE_TIMEOUT})..." + for component in ${LEASE_COMPONENTS}; do + if ! renew_time=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o jsonpath='{.spec.renewTime}' 2>/dev/null); then + echo "::error::failed to read leader election lease ${component}" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + if [[ -z "${renew_time}" ]]; then + echo "::error::leader election lease ${component} has empty spec.renewTime" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + if ! 
renew_epoch="$(lease_renew_epoch "${renew_time}")"; then + echo "::error::failed to parse leader election lease ${component} renewTime '${renew_time}'" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + lease_age=$((now_epoch - renew_epoch)) + if (( lease_age < 0 )); then + lease_age=0 + fi + echo "${component} lease renewTime=${renew_time} age=${lease_age}s" + if (( lease_age > LEASE_STALE_TIMEOUT_SECONDS )); then + echo "::error::leader election lease ${component} is stale: age=${lease_age}s exceeds ${LEASE_STALE_TIMEOUT}" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + done +} + +observe_stability_window() { + local label="$1" + local elapsed=0 + local probe=0 + local sleep_seconds + local consecutive_failures=0 + local total_failures=0 + + echo "Observing control-plane stability for ${STABILITY_WINDOW} (${label}); probing every ${STABILITY_PROBE_INTERVAL}, failing after ${STABILITY_PROBE_FAILURE_THRESHOLD} consecutive probe failure(s)..." + while (( elapsed < STABILITY_WINDOW_SECONDS )); do + sleep_seconds="${STABILITY_PROBE_INTERVAL_SECONDS}" + if (( elapsed + sleep_seconds > STABILITY_WINDOW_SECONDS )); then + sleep_seconds=$((STABILITY_WINDOW_SECONDS - elapsed)) + fi + if (( sleep_seconds > 0 )); then + sleep "${sleep_seconds}" + elapsed=$((elapsed + sleep_seconds)) + fi + + probe=$((probe + 1)) + echo "=== Control-plane stability probe ${probe} (${elapsed}/${STABILITY_WINDOW_SECONDS}s, ${label}) ===" + if probe_control_plane_api "during ${label} stability probe ${probe}"; then + consecutive_failures=0 + continue + fi + + total_failures=$((total_failures + 1)) + consecutive_failures=$((consecutive_failures + 1)) + echo "::warning::control-plane stability probe ${probe} failed (${consecutive_failures} consecutive, ${total_failures} total)" + if (( consecutive_failures >= STABILITY_PROBE_FAILURE_THRESHOLD )); then + echo "::error::control-plane had ${consecutive_failures} consecutive failed stability probes during ${label}" + 
dump_all_control_plane_runtime_diagnostics + exit 1 + fi + done + + if (( total_failures > 0 )); then + echo "::warning::control-plane had ${total_failures} transient failed stability probe(s) during ${label}; final health checks must still pass" + fi + verify_leader_lease_freshness +} + +dump_api_server_health() { + local endpoint + + for endpoint in '/livez?verbose' '/readyz?verbose' '/healthz'; do + echo "=== kube-apiserver ${endpoint} ===" + kubectl_kind get --raw="${endpoint}" || true + done +} + +dump_kind_node_runtime_summary() { + local node="${KIND_CLUSTER_NAME}-control-plane" + + if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then + echo "::warning::cannot collect node runtime summary: kind node container ${node} not found" + return + fi + + echo "=== ${node} docker stats ===" + docker_timeout stats --no-stream \ + --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \ + "${node}" || true + + echo "=== ${node} docker inspect state ===" + docker_timeout inspect \ + --format 'status={{.State.Status}} running={{.State.Running}} oomKilled={{.State.OOMKilled}} pid={{.State.Pid}} started={{.State.StartedAt}} finished={{.State.FinishedAt}}' \ + "${node}" || true + + echo "=== ${node} node pressure snapshot ===" + docker_timeout exec "${node}" sh -c ' + date + uptime || true + free -h || true + df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h + echo "--- top cpu/memory processes ---" + ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true + ' || true + + echo "=== ${node} CRI pod/container summary ===" + docker_timeout exec "${node}" crictl pods || true + docker_timeout exec "${node}" crictl ps -a || true + docker_timeout exec "${node}" crictl stats || true +} + +dump_static_pod_runtime_diagnostics() { + local component="$1" + local node="${KIND_CLUSTER_NAME}-control-plane" + local container_ids + local container_id + local count=0 + + if ! 
docker_timeout inspect "${node}" >/dev/null 2>&1; then + echo "::warning::cannot collect ${component} runtime diagnostics: kind node container ${node} not found" + return + fi + + echo "=== ${node} ${component} static pod manifest ===" + docker_timeout exec "${node}" sh -c "sed -n '1,220p' /etc/kubernetes/manifests/${component}.yaml" || true + + echo "=== ${node} ${component} CRI containers ===" + docker_timeout exec "${node}" crictl ps -a --name "${component}" || true + + container_ids=$(docker_timeout exec "${node}" crictl ps -a --name "${component}" -q 2>/dev/null || true) + for container_id in ${container_ids}; do + count=$((count + 1)) + if (( count > 8 )); then + echo "Skipping remaining ${component} CRI containers after first 8 entries." + break + fi + + echo "=== ${node} crictl inspect ${component} ${container_id} ===" + docker_timeout exec "${node}" crictl inspect "${container_id}" || true + echo "=== ${node} crictl logs ${component} ${container_id} ===" + docker_timeout exec "${node}" crictl logs --tail=200 "${container_id}" || true + done + + echo "=== ${node} kubelet journal (${component}) ===" + docker_timeout exec "${node}" journalctl -u kubelet --since '45 minutes ago' --no-pager 2>/dev/null \ + | grep -Ei "${component}|static pod|mirror pod|probe|liveness|readiness|startup|back-off|backoff|container|failed|error|oom|killed" \ + | tail -200 || true + + echo "=== ${node} containerd journal (${component}) ===" + docker_timeout exec "${node}" journalctl -u containerd --since '45 minutes ago' --no-pager 2>/dev/null \ + | grep -Ei "${component}|container|task|shim|deadline|failed|error|oom|killed" \ + | tail -200 || true +} + +dump_all_control_plane_runtime_diagnostics() { + local component + + dump_control_plane_summary + dump_api_server_health + dump_kind_node_runtime_summary + for component in ${COMPONENTS}; do + dump_static_pod_runtime_diagnostics "${component}" + kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true + 
done +} + +dump_component_diagnostics() { + local component="$1" + local selector="component=${component}" + local pods + local pod + + dump_control_plane_summary + kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true + kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true + kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + + pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name 2>/dev/null || true) + while IFS= read -r pod; do + [[ -z "${pod}" ]] && continue + echo "=== ${pod} logs ===" + kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --tail=100 2>/dev/null || true + echo "=== ${pod} previous logs ===" + kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true + done <<< "${pods}" + + dump_all_control_plane_runtime_diagnostics + kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true +} + +is_recovery_component() { + local component="$1" + local candidate + + for candidate in ${RECOVERY_COMPONENTS}; do + if [[ "${candidate}" == "${component}" ]]; then + return 0 + fi + done + return 1 +} + +try_recover_component() { + local component="$1" + local reason="$2" + local node="${KIND_CLUSTER_NAME}-control-plane" + local attempt + local container_ids + local container_id + + if [[ "${RECOVER_UNHEALTHY}" != "true" ]]; then + return 1 + fi + if (( MAX_RECOVERY_ATTEMPTS == 0 )); then + return 1 + fi + if ! is_recovery_component "${component}"; then + return 1 + fi + + attempt="${RECOVERY_ATTEMPTS[${component}]:-0}" + if (( attempt >= MAX_RECOVERY_ATTEMPTS )); then + return 1 + fi + RECOVERY_ATTEMPTS["${component}"]=$((attempt + 1)) + + echo "::warning::${component} is unhealthy (${reason}); restarting static pod container (attempt $((attempt + 1))/${MAX_RECOVERY_ATTEMPTS})" + dump_component_diagnostics "${component}" + + if ! 
docker_timeout inspect "${node}" >/dev/null 2>&1; then + echo "::warning::cannot recover ${component}: kind node container ${node} not found" + return 1 + fi + + if ! container_ids=$(docker_timeout exec "${node}" crictl ps --name "${component}" -q 2>/dev/null); then + echo "::warning::cannot recover ${component}: timed out or failed to list containers in ${node}" + return 1 + fi + if [[ -z "${container_ids}" ]]; then + echo "::warning::cannot recover ${component}: no running container found in ${node}" + return 1 + fi + + for container_id in ${container_ids}; do + echo "Stopping ${component} container ${container_id} in ${node}..." + if ! docker_timeout exec "${node}" crictl stop "${container_id}"; then + echo "::warning::failed to stop ${component} container ${container_id}" + return 1 + fi + done + + # Give kubelet a short interval to observe the stopped CRI container + # and refresh the mirror pod before kubectl wait reads pod status. + sleep "${STATIC_POD_RECREATE_SETTLE_SECONDS}" + if ! wait_ready "${component}"; then + echo "::warning::${component} did not recover after static pod container restart" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + return 1 + fi + + echo "${component} recovered after static pod container restart." + return 0 +} + +check_component() { + local component="$1" + local selector="component=${component}" + local pods + local initial_restarts + + if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then + if ! try_recover_component "${component}" "failed to list pods in ${NAMESPACE} with selector ${selector}"; then + echo "::error::failed to list ${component} pods in ${NAMESPACE} with selector ${selector}" + kubectl_kind -n "${NAMESPACE}" get pods -o wide || true + exit 1 + fi + if ! 
pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then + echo "::error::failed to list ${component} pods after recovery" + kubectl_kind -n "${NAMESPACE}" get pods -o wide || true + exit 1 + fi + fi + if [[ -z "${pods}" ]]; then + echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}" + kubectl_kind -n "${NAMESPACE}" get pods -o wide || true + exit 1 + fi + + if ! wait_ready "${component}"; then + if ! try_recover_component "${component}" "pods did not become Ready within ${WAIT_TIMEOUT}"; then + echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + fi + initial_restarts=$(restart_total "${component}") + report_restart_baseline "${component}" "${initial_restarts}" + INITIAL_RESTARTS["${component}"]="${initial_restarts}" +} + +verify_stability_window() { + local component + local initial_restarts + local final_restarts + local recovered=false + + if [[ "${STABILITY_WINDOW}" == "0s" ]]; then + return + fi + + observe_stability_window "primary" + for component in ${COMPONENTS}; do + initial_restarts="${INITIAL_RESTARTS[${component}]:-}" + if [[ -z "${initial_restarts}" ]]; then + echo "::error::missing initial restart count for ${component}" + exit 1 + fi + if ! wait_ready "${component}"; then + if ! 
try_recover_component "${component}" "pods became unready during ${STABILITY_WINDOW}"; then + echo "::error::${component} pods became unready during ${STABILITY_WINDOW}" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + initial_restarts=$(restart_total "${component}") + report_restart_baseline "${component}" "${initial_restarts}" + INITIAL_RESTARTS["${component}"]="${initial_restarts}" + recovered=true + continue + fi + final_restarts=$(restart_total "${component}") + if (( final_restarts > initial_restarts )); then + echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + INITIAL_RESTARTS["${component}"]="${final_restarts}" + done + + if [[ "${recovered}" != "true" ]]; then + return + fi + + echo "::warning::control-plane recovery occurred; observing one additional ${STABILITY_WINDOW} stability window" + observe_stability_window "post-recovery" + for component in ${COMPONENTS}; do + initial_restarts="${INITIAL_RESTARTS[${component}]:-}" + if [[ -z "${initial_restarts}" ]]; then + echo "::error::missing post-recovery restart count for ${component}" + exit 1 + fi + if ! 
wait_ready "${component}"; then + echo "::error::${component} pods became unready after recovery" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + final_restarts=$(restart_total "${component}") + if (( final_restarts > initial_restarts )); then + echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} after recovery" + dump_component_diagnostics "${component}" + exit 1 + fi + INITIAL_RESTARTS["${component}"]="${final_restarts}" + done +} + +for component in ${COMPONENTS}; do + check_component "${component}" +done +verify_stability_window +require_readyz "after stability window" diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml index b9bc3060f..324ce7a8f 100644 --- a/.github/actions/gpu-cluster-setup/action.yml +++ b/.github/actions/gpu-cluster-setup/action.yml @@ -15,18 +15,91 @@ name: 'GPU Cluster Setup' description: 'Creates a GPU-enabled kind cluster using nvkind with CDI-mode GPU passthrough.' 
+inputs: + kind_node_image: + description: 'Kind node image for nvkind cluster creation' + required: false + default: '' + min_gpu_count: + description: 'Minimum visible GPU count required before cluster setup' + required: false + default: '1' + gpu_model_pattern: + description: 'Optional grep-compatible GPU model pattern required for visible GPUs' + required: false + default: '' + min_free_disk_gb: + description: 'Minimum free disk space on / required before cluster setup' + required: false + default: '20' + min_available_memory_gb: + description: 'Minimum available system memory required before cluster setup' + required: false + default: '8' + cluster_create_timeout: + description: 'Timeout for nvkind cluster create' + required: false + default: '900s' + control_plane_resource_patches: + description: 'Apply kubeadm patches that raise control-plane static pod resource requests' + required: false + default: 'false' + control_plane_leader_election_tuning: + description: 'Increase kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes' + required: false + default: 'false' + leader_election_lease_duration: + description: 'Leader election lease duration when control_plane_leader_election_tuning is true' + required: false + default: '300s' + leader_election_renew_deadline: + description: 'Leader election renew deadline when control_plane_leader_election_tuning is true' + required: false + default: '240s' + leader_election_retry_period: + description: 'Leader election retry period when control_plane_leader_election_tuning is true' + required: false + default: '10s' + api_server_cpu_request: + description: 'kube-apiserver CPU request when control_plane_resource_patches is true' + required: false + default: '1000m' + api_server_memory_request: + description: 'kube-apiserver memory request when control_plane_resource_patches is true' + required: false + default: '1Gi' + controller_manager_cpu_request: + description: 
'kube-controller-manager CPU request when control_plane_resource_patches is true' + required: false + default: '1000m' + controller_manager_memory_request: + description: 'kube-controller-manager memory request when control_plane_resource_patches is true' + required: false + default: '512Mi' + scheduler_cpu_request: + description: 'kube-scheduler CPU request when control_plane_resource_patches is true' + required: false + default: '500m' + scheduler_memory_request: + description: 'kube-scheduler memory request when control_plane_resource_patches is true' + required: false + default: '256Mi' + etcd_cpu_request: + description: 'etcd CPU request when control_plane_resource_patches is true' + required: false + default: '1000m' + etcd_memory_request: + description: 'etcd memory request when control_plane_resource_patches is true' + required: false + default: '1Gi' + runs: using: 'composite' steps: - name: Validate environment shell: bash - run: | - if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then - echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow" - exit 1 - fi - + run: bash "${{ github.action_path }}/validate-env.sh" - name: Load versions id: versions uses: ./.github/actions/load-versions @@ -52,40 +125,61 @@ runs: - name: Install nvkind shell: bash - run: | - go install github.com/NVIDIA/nvkind/cmd/nvkind@latest - nvkind --help - - - name: Verify host GPU + env: + NVKIND_VERSION: ${{ steps.versions.outputs.nvkind }} + run: bash "${{ github.action_path }}/install-nvkind.sh" + - name: Runner preflight shell: bash - run: nvidia-smi -L - + env: + GPU_MODEL_PATTERN: ${{ inputs.gpu_model_pattern }} + MIN_GPU_COUNT: ${{ inputs.min_gpu_count }} + MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} + MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }} + run: bash "${{ github.action_path }}/runner-preflight.sh" - name: Configure NVIDIA Container Toolkit for kind shell: bash - run: | - sudo nvidia-ctk runtime configure --runtime=docker 
--set-as-default --cdi.enabled - sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place - sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place - sudo systemctl restart docker - + run: bash "${{ github.action_path }}/configure-nvidia-container-toolkit.sh" - name: Validate Docker GPU access shell: bash - run: docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L - + run: bash "${{ github.action_path }}/validate-docker-gpu-access.sh" - name: Increase inotify limits shell: bash - run: | - sudo sysctl -w fs.inotify.max_user_watches=524288 - sudo sysctl -w fs.inotify.max_user_instances=1024 - + run: bash "${{ github.action_path }}/increase-inotify-limits.sh" + - name: Delete stale kind cluster + shell: bash + run: bash "${{ github.action_path }}/delete-stale-kind-cluster.sh" + - name: Check runner capacity + shell: bash + env: + MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} + MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }} + run: bash "${{ github.action_path }}/check-runner-capacity.sh" + - name: Warm kind node image + if: ${{ inputs.kind_node_image != '' }} + shell: bash + env: + KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} + MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} + run: bash "${{ github.action_path }}/warm-kind-node-image.sh" - name: Create GPU-enabled kind cluster shell: bash - run: | - nvkind cluster create --name="${KIND_CLUSTER_NAME}" || echo "::warning::nvkind cluster create returned non-zero (umount errors are expected with CDI mode)" - kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s - kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info - kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide - + env: + KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} + CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }} + CONTROL_PLANE_RESOURCE_PATCHES: 
${{ inputs.control_plane_resource_patches }} + CONTROL_PLANE_LEADER_ELECTION_TUNING: ${{ inputs.control_plane_leader_election_tuning }} + LEADER_ELECTION_LEASE_DURATION: ${{ inputs.leader_election_lease_duration }} + LEADER_ELECTION_RENEW_DEADLINE: ${{ inputs.leader_election_renew_deadline }} + LEADER_ELECTION_RETRY_PERIOD: ${{ inputs.leader_election_retry_period }} + API_SERVER_CPU_REQUEST: ${{ inputs.api_server_cpu_request }} + API_SERVER_MEMORY_REQUEST: ${{ inputs.api_server_memory_request }} + CONTROLLER_MANAGER_CPU_REQUEST: ${{ inputs.controller_manager_cpu_request }} + CONTROLLER_MANAGER_MEMORY_REQUEST: ${{ inputs.controller_manager_memory_request }} + SCHEDULER_CPU_REQUEST: ${{ inputs.scheduler_cpu_request }} + SCHEDULER_MEMORY_REQUEST: ${{ inputs.scheduler_memory_request }} + ETCD_CPU_REQUEST: ${{ inputs.etcd_cpu_request }} + ETCD_MEMORY_REQUEST: ${{ inputs.etcd_memory_request }} + run: bash "${{ github.action_path }}/create-gpu-kind-cluster.sh" - name: Print GPUs (nvkind) shell: bash run: nvkind cluster print-gpus --name="${KIND_CLUSTER_NAME}" diff --git a/.github/actions/gpu-cluster-setup/check-runner-capacity.sh b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh new file mode 100644 index 000000000..ff6c3168e --- /dev/null +++ b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail +free_disk_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9') +min_free_disk_bytes=$((MIN_FREE_DISK_GB * 1024 * 1024 * 1024)) +free_disk_gib=$((free_disk_bytes / 1024 / 1024 / 1024)) +if (( free_disk_bytes < min_free_disk_bytes )); then + echo "::error::free disk on / is ${free_disk_bytes} bytes (${free_disk_gib}GiB), need at least ${min_free_disk_bytes} bytes (${MIN_FREE_DISK_GB}GiB)" + exit 1 +fi + +available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}') +if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then + echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB" + exit 1 +fi + +echo "Runner capacity is sufficient: disk=${free_disk_gib}GiB (${free_disk_bytes} bytes) memory=${available_memory_gb}GiB" diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh new file mode 100644 index 000000000..84635a988 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled +sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place +sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place +set +e +timeout 120s sudo systemctl restart docker +restart_status=$? +set -e +if (( restart_status != 0 )); then + echo "::error::Docker restart failed after NVIDIA runtime configuration" + sudo systemctl status docker --no-pager || true + sudo journalctl -u docker --since "10 minutes ago" --no-pager || true + exit "${restart_status}" +fi + +for attempt in $(seq 1 30); do + if systemctl is-active --quiet docker && timeout 5s docker info >/dev/null 2>&1; then + echo "Docker is healthy after NVIDIA runtime configuration." + exit 0 + fi + echo "Waiting for Docker to become healthy... (${attempt}/30)" + sleep 2 +done + +echo "::error::Docker did not become healthy after NVIDIA runtime configuration" +sudo systemctl status docker --no-pager || true +exit 1 diff --git a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh new file mode 100644 index 000000000..0c22fb845 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh @@ -0,0 +1,487 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +validate_duration_input() { + local input_name="$1" + local input_value="$2" + + if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'" + exit 1 + fi +} + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +kubectl_kind_wait() { + timeout 330s kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +docker_timeout() { + local limit="$1" + shift + timeout "${limit}" docker "$@" +} + +validate_generated_control_plane_config() { + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + for patch_file in "${patch_dir}"/*.yaml; do + if ! grep -Fxq 'apiVersion: v1' "${patch_file}" || + ! grep -Fxq 'kind: Pod' "${patch_file}" || + ! grep -Eq '^[[:space:]]+resources:$' "${patch_file}"; then + echo "::error::rendered static pod patch ${patch_file} is missing expected top-level YAML" + sed 's/^/ /' "${patch_file}" || true + exit 1 + fi + done + + if ! grep -Eq '^[[:space:]]*extraMounts:$' "${config_template}" || + ! grep -Fq 'directory: /patches' "${config_template}"; then + echo "::error::rendered kind config is missing control-plane patch mounts" + sed 's/^/ /' "${config_template}" || true + exit 1 + fi + fi + + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + for expected in \ + 'apiVersion: kubeadm.k8s.io/v1beta3' \ + 'apiVersion: kubeadm.k8s.io/v1beta4' \ + "leader-elect-lease-duration: \"${LEADER_ELECTION_LEASE_DURATION}\"" \ + "leader-elect-renew-deadline: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \ + "leader-elect-retry-period: \"${LEADER_ELECTION_RETRY_PERIOD}\"" \ + "value: \"${LEADER_ELECTION_LEASE_DURATION}\"" \ + "value: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \ + "value: \"${LEADER_ELECTION_RETRY_PERIOD}\""; do + if ! 
grep -Fq "${expected}" "${config_template}"; then + echo "::error::rendered kind config is missing expected leader election setting: ${expected}" + sed 's/^/ /' "${config_template}" || true + exit 1 + fi + done + fi +} + +validate_duration_input cluster_create_timeout "${CLUSTER_CREATE_TIMEOUT}" +validate_duration_input leader_election_lease_duration "${LEADER_ELECTION_LEASE_DURATION}" +validate_duration_input leader_election_renew_deadline "${LEADER_ELECTION_RENEW_DEADLINE}" +validate_duration_input leader_election_retry_period "${LEADER_ELECTION_RETRY_PERIOD}" + +CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}") +if [[ -n "${KIND_NODE_IMAGE}" ]]; then + echo "Using kind node image: ${KIND_NODE_IMAGE}" + CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}") +fi + +case "${CONTROL_PLANE_RESOURCE_PATCHES}" in + true) ;; + ""|false) CONTROL_PLANE_RESOURCE_PATCHES=false ;; + *) + echo "::error::control_plane_resource_patches must be true or false, got '${CONTROL_PLANE_RESOURCE_PATCHES}'" + exit 1 + ;; +esac + +case "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" in + true) ;; + ""|false) CONTROL_PLANE_LEADER_ELECTION_TUNING=false ;; + *) + echo "::error::control_plane_leader_election_tuning must be true or false, got '${CONTROL_PLANE_LEADER_ELECTION_TUNING}'" + exit 1 + ;; +esac + +if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + patch_dir="$(mktemp -d)" + config_template="$(mktemp)" + cleanup_generated_config() { + [[ -n "${patch_dir:-}" ]] && rm -rf "${patch_dir}" + [[ -n "${config_template:-}" ]] && rm -f "${config_template}" + } + trap cleanup_generated_config EXIT + + # Keep YAML heredocs at column 0; indentation is literal content. 
+ if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + cat > "${patch_dir}/kube-apiserver+strategic.yaml" < "${patch_dir}/kube-controller-manager+strategic.yaml" < "${patch_dir}/kube-scheduler+strategic.yaml" < "${patch_dir}/etcd+strategic.yaml" < "${config_template}" <<'EOF' +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +{{- if hasKey $ "name" }} +name: {{ $.name }} +{{- end }} +nodes: +- role: control-plane + {{- if hasKey $ "image" }} + image: {{ $.image }} + {{- end }} +EOF + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + cat >> "${config_template}" <> "${config_template}" <<'EOF' + kubeadmConfigPatches: +EOF + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + cat >> "${config_template}" <<'EOF' + - | + kind: InitConfiguration + patches: + directory: /patches +EOF + fi + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + # kind v0.31 renders kubeadm v1beta3. Keep a v1beta4 patch too so + # this remains valid when a future kind image switches API versions. + cat >> "${config_template}" <> "${config_template}" <<'EOF' +{{- range $.workers }} +- role: worker + {{- if hasKey $ "image" }} + image: {{ $.image }} + {{- end }} + + {{- if hasKey . "devices" }} + {{- $devices := .devices }} + {{- if not (kindIs "slice" $devices) }} + {{- $devices = list .devices }} + {{- end }} + extraMounts: + # We inject all NVIDIA GPUs using the nvidia-container-runtime. 
+ # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set + # in `/etc/nvidia-container-runtime/config.toml` + {{- range $d := $devices }} + - hostPath: /dev/null + containerPath: /var/run/nvidia-container-devices/{{ $d }} + {{- end }} + {{- end }} +{{- end }} +EOF + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + echo "Applying control-plane static pod resource patches from ${patch_dir}:" + for patch_file in "${patch_dir}"/*.yaml; do + echo "--- ${patch_file}" + sed 's/^/ /' "${patch_file}" + done + fi + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + echo "Increasing kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes:" + echo " lease-duration=${LEADER_ELECTION_LEASE_DURATION}" + echo " renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}" + echo " retry-period=${LEADER_ELECTION_RETRY_PERIOD}" + fi + validate_generated_control_plane_config + CREATE_ARGS+=(--config-template="${config_template}") +fi + +set +e +timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}" +create_status=$? 
+set -e +case "${create_status}" in + 0) ;; + 124) + echo "::warning::nvkind cluster create timed out after ${CLUSTER_CREATE_TIMEOUT}; continuing only if post-create checks pass" + ;; + *) + echo "::error::nvkind cluster create failed with status ${create_status}" + exit "${create_status}" + ;; +esac + +kubectl_kind_wait wait --for=condition=Ready nodes --all --timeout=300s +kubectl_kind cluster-info +kubectl_kind get nodes -o wide +kubectl_kind describe nodes | \ + grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:| cpu| memory| nvidia.com/gpu)" || true + +echo "=== Kind node container resources ===" +docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + --format '{{.Names}}' | sort | while read -r node_container; do + [[ -z "${node_container}" ]] && continue + docker_timeout 30s inspect "${node_container}" \ + --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' + done + +echo "=== Control-plane resource requests/limits ===" +kubectl_kind -n kube-system \ + get pods -l tier=control-plane -o json | jq -r ' + .items[] as $pod | + $pod.metadata.name, + ($pod.spec.containers[] | + " " + .name + + " requests=" + ((.resources.requests // {}) | tostring) + + " limits=" + ((.resources.limits // {}) | tostring)) + ' || true + +normalize_cpu_request() { + local cpu="$1" + + if [[ "${cpu}" =~ ^([0-9]+)000m$ ]]; then + echo "${BASH_REMATCH[1]}" + return + fi + echo "${cpu}" +} + +control_plane_request() { + local component="$1" + local resource="$2" + + kubectl_kind -n kube-system \ + get pod -l "component=${component}" \ + -o "jsonpath={.items[0].spec.containers[0].resources.requests.${resource}}" +} + +assert_control_plane_request() { + local component="$1" + local resource="$2" + local expected="$3" + local actual + + actual="$(control_plane_request "${component}" "${resource}")" + if [[ "${resource}" == "cpu" 
]]; then + expected="$(normalize_cpu_request "${expected}")" + actual="$(normalize_cpu_request "${actual}")" + fi + if [[ "${actual}" != "${expected}" ]]; then + echo "::error::${component} ${resource} request is '${actual}', expected '${expected}'" + exit 1 + fi + echo "${component} ${resource} request verified: ${actual}" +} + +control_plane_command_args() { + local component="$1" + + kubectl_kind -n kube-system \ + get pod -l "component=${component}" \ + -o json | jq -r '.items[0].spec.containers[0] | ((.command // []) + (.args // []))[]?' +} + +static_pod_manifest_contains_arg() { + local component="$1" + local expected="$2" + local node="${KIND_CLUSTER_NAME}-control-plane" + + docker_timeout 30s exec "${node}" grep -Fq -- "- ${expected}" "/etc/kubernetes/manifests/${component}.yaml" +} + +running_static_pod_container_contains_arg() { + local component="$1" + local expected="$2" + local node="${KIND_CLUSTER_NAME}-control-plane" + local container_ids + local container_id + local inspect_output + + if ! container_ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null)"; then + return 1 + fi + [[ -z "${container_ids}" ]] && return 1 + + for container_id in ${container_ids}; do + inspect_output="$(docker_timeout 30s exec "${node}" crictl inspect "${container_id}" 2>/dev/null || true)" + if jq -e --arg expected "${expected}" ' + ([.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?] 
| index($expected)) != null + ' >/dev/null 2>&1 <<< "${inspect_output}" || grep -Fq -- "${expected}" <<< "${inspect_output}"; then + return 0 + fi + done + return 1 +} + +dump_running_static_pod_container_args() { + local component="$1" + local node="${KIND_CLUSTER_NAME}-control-plane" + local container_ids + local container_id + + echo "Running ${component} CRI container args:" + container_ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)" + if [[ -z "${container_ids}" ]]; then + echo "(no running ${component} CRI containers found)" + return + fi + for container_id in ${container_ids}; do + echo "--- ${container_id} ---" + docker_timeout 30s exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -r ' + [.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?][]? + ' || true + done +} + +dump_static_pod_manifest() { + local component="$1" + local node="${KIND_CLUSTER_NAME}-control-plane" + + echo "Static pod manifest /etc/kubernetes/manifests/${component}.yaml:" + docker_timeout 30s exec "${node}" sed -n '1,220p' "/etc/kubernetes/manifests/${component}.yaml" || true +} + +assert_control_plane_arg() { + local component="$1" + local expected="$2" + local attempt + local command_args + + for attempt in $(seq 1 12); do + command_args="$(control_plane_command_args "${component}" || true)" + if grep -Fxq -- "${expected}" <<< "${command_args}"; then + echo "${component} command/args verified: ${expected}" + return + fi + if running_static_pod_container_contains_arg "${component}" "${expected}"; then + echo "${component} running CRI container args verified: ${expected} (live mirror pod omitted it)" + return + fi + if static_pod_manifest_contains_arg "${component}" "${expected}"; then + echo "::warning::${component} static pod manifest has ${expected}, but the running container does not yet; waiting for kubelet to converge (${attempt}/12)" + sleep 5 + continue + fi + + break + done + + echo 
"::error::${component} running command/args does not contain ${expected}" + echo "Observed live command/args:" + echo "${command_args:-}" + dump_running_static_pod_container_args "${component}" + dump_static_pod_manifest "${component}" + exit 1 +} + +if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + echo "Verifying control-plane resource patches..." + assert_control_plane_request kube-apiserver cpu "${API_SERVER_CPU_REQUEST}" + assert_control_plane_request kube-apiserver memory "${API_SERVER_MEMORY_REQUEST}" + assert_control_plane_request kube-controller-manager cpu "${CONTROLLER_MANAGER_CPU_REQUEST}" + assert_control_plane_request kube-controller-manager memory "${CONTROLLER_MANAGER_MEMORY_REQUEST}" + assert_control_plane_request kube-scheduler cpu "${SCHEDULER_CPU_REQUEST}" + assert_control_plane_request kube-scheduler memory "${SCHEDULER_MEMORY_REQUEST}" + assert_control_plane_request etcd cpu "${ETCD_CPU_REQUEST}" + assert_control_plane_request etcd memory "${ETCD_MEMORY_REQUEST}" +fi + +if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + echo "Verifying control-plane leader election timeout patches..." + for component in kube-controller-manager kube-scheduler; do + assert_control_plane_arg "${component}" "--leader-elect-lease-duration=${LEADER_ELECTION_LEASE_DURATION}" + assert_control_plane_arg "${component}" "--leader-elect-renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}" + assert_control_plane_arg "${component}" "--leader-elect-retry-period=${LEADER_ELECTION_RETRY_PERIOD}" + done +fi diff --git a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh new file mode 100644 index 000000000..5e0a81778 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail +kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" +docker_timeout() { + timeout 30s docker "$@" +} + +read_kind_container_ids() { + local output + + if ! output="$(docker_timeout ps -aq --filter "label=${kind_cluster_label}" 2>&1)"; then + echo "::error::failed to query stale kind containers for ${KIND_CLUSTER_NAME}" + echo "${output}" + exit 1 + fi + + remaining_containers=() + if [[ -n "${output}" ]]; then + mapfile -t remaining_containers <<< "${output}" + fi +} + +if kind get clusters | grep -Fxq "${KIND_CLUSTER_NAME}"; then + echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}" + if ! 
timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then + echo "::warning::kind delete cluster timed out or failed; falling back to direct container cleanup" + fi +else + echo "No stale kind cluster named ${KIND_CLUSTER_NAME}" +fi + +read_kind_container_ids +if (( ${#remaining_containers[@]} > 0 )); then + echo "Removing stale containers for ${KIND_CLUSTER_NAME}:" + docker_timeout ps -a --filter "label=${kind_cluster_label}" + docker_timeout rm -f "${remaining_containers[@]}" +fi + +read_kind_container_ids +if (( ${#remaining_containers[@]} > 0 )); then + echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:" + docker_timeout ps -a --filter "label=${kind_cluster_label}" + exit 1 +fi diff --git a/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh new file mode 100644 index 000000000..843496a38 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +sudo sysctl -w fs.inotify.max_user_watches=524288 +sudo sysctl -w fs.inotify.max_user_instances=1024 diff --git a/.github/actions/gpu-cluster-setup/install-nvkind.sh b/.github/actions/gpu-cluster-setup/install-nvkind.sh new file mode 100644 index 000000000..c2200e078 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/install-nvkind.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +if [[ -z "${NVKIND_VERSION:-}" ]]; then + echo "::error::NVKIND_VERSION must be set" + exit 1 +fi + +go install "github.com/NVIDIA/nvkind/cmd/nvkind@${NVKIND_VERSION}" +nvkind_bin="${GOBIN:-$(go env GOPATH)/bin}/nvkind" +"${nvkind_bin}" --help diff --git a/.github/actions/gpu-cluster-setup/runner-preflight.sh b/.github/actions/gpu-cluster-setup/runner-preflight.sh new file mode 100644 index 000000000..678b9d419 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/runner-preflight.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +echo "=== Runner baseline ===" +date -u +hostname +uptime +nproc +free -h +df -h / +df -ih / + +for value_name in MIN_GPU_COUNT MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do + value="${!value_name}" + if ! [[ "${value}" =~ ^[0-9]+$ ]]; then + echo "::error::${value_name} must be an integer, got '${value}'" + exit 1 + fi +done + +echo "=== Docker health ===" +docker info >/dev/null +docker version + +echo "=== Host GPUs ===" +nvidia-smi -L +nvidia-smi + +mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader) +if [[ -n "${GPU_MODEL_PATTERN}" ]]; then + set +e + gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic -- "${GPU_MODEL_PATTERN}") + grep_status=$? 
+ set -e + if (( grep_status == 2 )); then + echo "::error::invalid gpu_model_pattern regex: ${GPU_MODEL_PATTERN}" + exit 1 + fi + if (( grep_status != 0 )); then + gpu_count=0 + fi + echo "Visible GPUs matching '${GPU_MODEL_PATTERN}': ${gpu_count}" +else + gpu_count="${#gpu_names[@]}" + echo "Visible GPUs: ${gpu_count}" +fi + +if (( gpu_count < MIN_GPU_COUNT )); then + echo "::error::visible GPU count ${gpu_count} is below required minimum ${MIN_GPU_COUNT}" + exit 1 +fi + +echo "=== Existing kind state ===" +kind get clusters || true +docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true diff --git a/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh new file mode 100644 index 000000000..6f01ba156 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail +timeout 300s docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L diff --git a/.github/actions/gpu-cluster-setup/validate-env.sh b/.github/actions/gpu-cluster-setup/validate-env.sh new file mode 100644 index 000000000..697d077c2 --- /dev/null +++ b/.github/actions/gpu-cluster-setup/validate-env.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then + echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow" + exit 1 +fi diff --git a/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh new file mode 100644 index 000000000..b0567fa7c --- /dev/null +++ b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail +echo "=== Kind node image cache ===" +if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then + echo "Kind node image already cached: ${KIND_NODE_IMAGE}" +else + echo "Pulling kind node image: ${KIND_NODE_IMAGE}" + timeout 600s docker pull "${KIND_NODE_IMAGE}" +fi +free_disk_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9') +min_free_disk_bytes=$((MIN_FREE_DISK_GB * 1024 * 1024 * 1024)) +free_disk_gib=$((free_disk_bytes / 1024 / 1024 / 1024)) +if (( free_disk_bytes < min_free_disk_bytes )); then + echo "::error::free disk on / is ${free_disk_bytes} bytes (${free_disk_gib}GiB) after warming ${KIND_NODE_IMAGE}, need at least ${min_free_disk_bytes} bytes (${MIN_FREE_DISK_GB}GiB)" + exit 1 +fi +echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gib}GiB (${free_disk_bytes} bytes)" diff --git a/.github/actions/gpu-debug-diagnostics/action.yml b/.github/actions/gpu-debug-diagnostics/action.yml new file mode 100644 index 000000000..e5a38b964 --- /dev/null +++ b/.github/actions/gpu-debug-diagnostics/action.yml @@ -0,0 +1,35 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: 'GPU Debug Diagnostics' +description: 'Print bounded GPU CI diagnostics while the kind cluster is still present.' 
+ +inputs: + cluster_name: + description: 'Kind cluster name' + required: true + mode: + description: 'Diagnostic mode: smoke, training, or inference' + required: false + default: 'smoke' + +runs: + using: 'composite' + steps: + - name: Print GPU debug diagnostics + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + GPU_TEST_DIAGNOSTIC_MODE: ${{ inputs.mode }} + run: bash "${{ github.workspace }}/.github/scripts/gpu-debug-diagnostics.sh" diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml index e2bdb300c..b30c63f2d 100644 --- a/.github/actions/gpu-operator-install/action.yml +++ b/.github/actions/gpu-operator-install/action.yml @@ -31,6 +31,14 @@ inputs: description: 'Platform for recipe generation (bundle mode only, e.g. dynamo)' required: false default: '' + wait: + description: 'Wait for bundle Helm resources during deploy' + required: false + default: 'false' + best_effort: + description: 'Continue deploying remaining bundle components after a component failure' + required: false + default: 'true' runs: using: 'composite' @@ -41,102 +49,33 @@ runs: - name: Install GPU Operator (helm) if: inputs.method == 'helm' shell: bash - run: | - helm repo add nvidia https://helm.ngc.nvidia.com/nvidia - helm repo update - helm upgrade -i \ - --kube-context="kind-${KIND_CLUSTER_NAME}" \ - --namespace gpu-operator \ - --create-namespace \ - --set driver.enabled=false \ - --set toolkit.enabled=false \ - --set dcgmExporter.enabled=false \ - --set nfd.enabled=true \ - --wait --timeout=600s \ - gpu-operator nvidia/gpu-operator - + run: bash "${{ github.action_path }}/install-gpu-operator-helm.sh" - name: Wait for GPU operands (helm) if: inputs.method == 'helm' shell: bash - run: | - echo "Waiting for device plugin to be ready..." 
- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true - echo "GPU Operator pods:" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods - + run: bash "${{ github.action_path }}/wait-gpu-operands-helm.sh" # --- Bundle mode: aicr recipe → bundle → deploy --- - name: Generate recipe if: inputs.method == 'bundle' shell: bash - run: | - PLATFORM_FLAG="" - if [[ -n "${{ inputs.platform }}" ]]; then - PLATFORM_FLAG="--platform ${{ inputs.platform }}" - fi - ./aicr recipe \ - --service kind \ - --accelerator ${{ inputs.accelerator }} \ - --os ubuntu \ - --intent ${{ inputs.intent }} \ - ${PLATFORM_FLAG} \ - --output recipe.yaml - echo "--- Recipe ---" - cat recipe.yaml - + env: + AICR_ACCELERATOR: ${{ inputs.accelerator }} + AICR_INTENT: ${{ inputs.intent }} + AICR_PLATFORM: ${{ inputs.platform }} + run: bash "${{ github.action_path }}/generate-recipe.sh" - name: Generate deployment bundle if: inputs.method == 'bundle' shell: bash - run: | - ./aicr bundle \ - --recipe recipe.yaml \ - --accelerated-node-toleration nvidia.com/gpu:NoSchedule \ - --output bundle - echo "--- Bundle contents ---" - ls -la bundle/ - + run: bash "${{ github.action_path }}/generate-bundle.sh" - name: Install bundle into cluster if: inputs.method == 'bundle' shell: bash - run: | - cd bundle - # Use --no-wait: several components (gpu-operator ClusterPolicy, - # kai-scheduler SchedulingShard, nvidia-dra-driver-gpu kubelet plugin) - # stay InProgress in kind because their CRs/DaemonSets require - # features not available in kind (DRA feature gates, driver modules). - # The explicit "Wait for GPU operands" step below gates on what - # actually matters (device plugin readiness). - # --best-effort: some components (e.g. network-operator) have Helm - # hooks that may time out in Kind; continue deploying remaining - # components so the overall stack is functional. 
- chmod +x deploy.sh - echo "--- deploy.sh ---" - cat deploy.sh - ./deploy.sh --no-wait --best-effort - + env: + AICR_DEPLOY_WAIT: ${{ inputs.wait }} + AICR_DEPLOY_BEST_EFFORT: ${{ inputs.best_effort }} + run: bash "${{ github.action_path }}/install-bundle.sh" - name: Wait for GPU operands (bundle) if: inputs.method == 'bundle' shell: bash - run: | - echo "Waiting for GPU operator controller to deploy operands..." - # The GPU operator controller watches ClusterPolicy and creates - # DaemonSets for device-plugin, NFD, GFD, etc. This happens - # asynchronously after the helm install completes. - for i in $(seq 1 30); do - count=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l) - if [[ "$count" -gt 0 ]]; then - echo "Device plugin DaemonSet found." - break - fi - echo "Waiting for device plugin DaemonSet to be created... (${i}/30)" - sleep 10 - done - echo "Waiting for device plugin rollout..." - # Operands are excluded from control-plane nodes via nodeAffinity in - # the kind overlay, so all scheduled pods should become ready. - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s - echo "GPU Operator pods:" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods + run: bash "${{ github.action_path }}/wait-gpu-operands-bundle.sh" diff --git a/.github/actions/gpu-operator-install/generate-bundle.sh b/.github/actions/gpu-operator-install/generate-bundle.sh new file mode 100644 index 000000000..095b68415 --- /dev/null +++ b/.github/actions/gpu-operator-install/generate-bundle.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +rm -rf bundle +./aicr bundle \ + --recipe recipe.yaml \ + --accelerated-node-toleration nvidia.com/gpu:NoSchedule \ + --output bundle +echo "--- Bundle contents ---" +ls -la bundle/ diff --git a/.github/actions/gpu-operator-install/generate-recipe.sh b/.github/actions/gpu-operator-install/generate-recipe.sh new file mode 100644 index 000000000..6015e69ed --- /dev/null +++ b/.github/actions/gpu-operator-install/generate-recipe.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +RECIPE_ARGS=( + --service kind + --accelerator "${AICR_ACCELERATOR}" + --os ubuntu + --intent "${AICR_INTENT}" +) +if [[ -n "${AICR_PLATFORM}" ]]; then + RECIPE_ARGS+=(--platform "${AICR_PLATFORM}") +fi + +./aicr recipe "${RECIPE_ARGS[@]}" --output recipe.yaml +echo "Recipe written to recipe.yaml" diff --git a/.github/actions/gpu-operator-install/install-bundle.sh b/.github/actions/gpu-operator-install/install-bundle.sh new file mode 100644 index 000000000..cefa4ce5d --- /dev/null +++ b/.github/actions/gpu-operator-install/install-bundle.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +cd bundle +# The default keeps legacy bundle-mode behavior: do not wait on every +# Helm resource and keep deploying after component failures. H100 +# qualification jobs override these inputs to hard-fail and wait. 
+chmod +x deploy.sh +DEPLOY_ARGS=() +if [[ "${AICR_DEPLOY_WAIT}" != "true" ]]; then + DEPLOY_ARGS+=(--no-wait) +fi +if [[ "${AICR_DEPLOY_BEST_EFFORT}" == "true" ]]; then + DEPLOY_ARGS+=(--best-effort) +fi +if [[ "${#DEPLOY_ARGS[@]}" -gt 0 ]]; then + echo "Deploying bundle with args: ${DEPLOY_ARGS[*]}" +else + echo "Deploying bundle with default args" +fi +./deploy.sh "${DEPLOY_ARGS[@]}" diff --git a/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh new file mode 100644 index 000000000..6079cad83 --- /dev/null +++ b/.github/actions/gpu-operator-install/install-gpu-operator-helm.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +GPU_OPERATOR_CHART_VERSION="v25.10.1" + +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update +helm repo update +helm upgrade -i \ + --kube-context="kind-${KIND_CLUSTER_NAME}" \ + --namespace gpu-operator \ + --create-namespace \ + --set driver.enabled=false \ + --set toolkit.enabled=false \ + --set dcgmExporter.enabled=false \ + --set nfd.enabled=true \ + --version="${GPU_OPERATOR_CHART_VERSION}" \ + --wait --timeout=600s \ + gpu-operator nvidia/gpu-operator diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh new file mode 100644 index 000000000..9566fb8ba --- /dev/null +++ b/.github/actions/gpu-operator-install/wait-gpu-operands-bundle.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +echo "Waiting for GPU operator controller to deploy operands..." +# The GPU operator controller watches ClusterPolicy and creates +# DaemonSets for device-plugin, NFD, GFD, etc. This happens +# asynchronously after the helm install completes. 
+daemonset_found=false +for i in $(seq 1 30); do + daemonsets="" + if daemonsets=$(kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ + get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null); then + if [[ -n "${daemonsets}" ]]; then + daemonset_found=true + fi + fi + if [[ "${daemonset_found}" == "true" ]]; then + echo "Device plugin DaemonSet found." + break + fi + echo "Waiting for device plugin DaemonSet to be created... (${i}/30)" + sleep 10 +done +if [[ "${daemonset_found}" != "true" ]]; then + echo "::error::device plugin DaemonSet was not created within 300s" + kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods || true + kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' || true + exit 1 +fi +echo "Waiting for device plugin rollout..." +# Operands are excluded from control-plane nodes via nodeAffinity in +# the kind overlay, so all scheduled pods should become ready. +kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ + rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s +echo "GPU Operator pods:" +kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods diff --git a/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh new file mode 100644 index 000000000..3d3042f8a --- /dev/null +++ b/.github/actions/gpu-operator-install/wait-gpu-operands-helm.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +echo "Waiting for device plugin to be ready..." +for i in $(seq 1 30); do + if kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ + get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | grep -q .; then + echo "Device plugin DaemonSet found." + break + fi + if (( i == 30 )); then + echo "::error::device plugin DaemonSet was not created within 300s" + kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods || true + exit 1 + fi + echo "Waiting for device plugin DaemonSet to be created... (${i}/30)" + sleep 10 +done + +kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ + rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s +echo "GPU Operator pods:" +kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods diff --git a/.github/actions/gpu-smoke-nvidia-smi/action.yml b/.github/actions/gpu-smoke-nvidia-smi/action.yml new file mode 100644 index 000000000..cb61b5d0d --- /dev/null +++ b/.github/actions/gpu-smoke-nvidia-smi/action.yml @@ -0,0 +1,36 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: 'GPU Smoke nvidia-smi' +description: 'Run nvidia-smi in a GPU-backed kind pod and print its logs.' + +inputs: + cluster_name: + description: 'Kind cluster name' + required: true + +runs: + using: 'composite' + steps: + - name: Run nvidia-smi in a pod + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-run-nvidia-smi.sh" + - name: Show nvidia-smi output + if: always() + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh" diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml index e1ee3c14b..7af987da0 100644 --- a/.github/actions/gpu-snapshot-validate/action.yml +++ b/.github/actions/gpu-snapshot-validate/action.yml @@ -26,60 +26,28 @@ inputs: cluster_name: description: 'Kind cluster name (for kubectl context)' required: true + snapshot_timeout: + description: 'Timeout for aicr snapshot' + required: false + default: '5m' runs: using: composite steps: - name: Run aicr snapshot shell: bash - run: | - ./aicr snapshot \ - --kubeconfig="${HOME}/.kube/config" \ - --namespace=default \ - --image=ko.local:smoke-test \ - --require-gpu \ - --output=snapshot.yaml - echo "--- Snapshot output ---" - cat snapshot.yaml - + env: + SNAPSHOT_TIMEOUT: ${{ inputs.snapshot_timeout }} + run: bash "${{ github.action_path }}/run-snapshot.sh" - name: Validate snapshot detected GPU shell: bash - run: | - # Query by subtype field (not 
index) — #502 added a "hardware" subtype before "smi". - GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml) - GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml) - echo "GPU model: ${GPU_MODEL}" - echo "GPU count: ${GPU_COUNT}" - if [[ "${GPU_MODEL}" != *"${{ inputs.gpu_model }}"* ]]; then - echo "::error::Expected ${{ inputs.gpu_model }} GPU in snapshot, got: ${GPU_MODEL}" - exit 1 - fi - if [[ "${GPU_COUNT}" -lt ${{ inputs.min_gpu_count }} ]]; then - echo "::error::Expected gpu-count >= ${{ inputs.min_gpu_count }}, got: ${GPU_COUNT}" - exit 1 - fi - echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}" - + env: + EXPECTED_GPU_MODEL: ${{ inputs.gpu_model }} + MIN_GPU_COUNT: ${{ inputs.min_gpu_count }} + run: bash "${{ github.action_path }}/validate-snapshot-gpu.sh" - name: Debug snapshot Job if: failure() shell: bash - run: | - echo "=== Snapshot Job ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default get job aicr -o yaml || true - echo "=== Snapshot Pods ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - get pods -l app.kubernetes.io/name=aicr -o wide || true - echo "=== Snapshot Job describe ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default describe job aicr || true - echo "=== Snapshot Pod describe ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - describe pods -l app.kubernetes.io/name=aicr || true - echo "=== Snapshot current logs ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true - echo "=== Snapshot previous logs ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true - echo "=== Snapshot ConfigMap 
===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - get configmap aicr-snapshot -o yaml || true + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.action_path }}/debug-snapshot-job.sh" diff --git a/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh new file mode 100644 index 000000000..2e0f1547f --- /dev/null +++ b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +echo "=== Snapshot Job ===" +kubectl_kind -n default get job aicr -o yaml || true +echo "=== Snapshot Pods ===" +kubectl_kind -n default get pods -l app.kubernetes.io/name=aicr -o wide || true +echo "=== Snapshot Job describe ===" +kubectl_kind -n default describe job aicr || true +echo "=== Snapshot Pod describe ===" +kubectl_kind -n default describe pods -l app.kubernetes.io/name=aicr || true +echo "=== Snapshot current logs ===" +kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true +echo "=== Snapshot previous logs ===" +kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true +echo "=== Snapshot ConfigMap ===" +kubectl_kind -n default get configmap aicr-snapshot -o yaml || true diff --git a/.github/actions/gpu-snapshot-validate/run-snapshot.sh b/.github/actions/gpu-snapshot-validate/run-snapshot.sh new file mode 100644 index 000000000..e45b575ef --- /dev/null +++ b/.github/actions/gpu-snapshot-validate/run-snapshot.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +./aicr snapshot \ + --kubeconfig="${HOME}/.kube/config" \ + --namespace=default \ + --image=ko.local:smoke-test \ + --require-gpu \ + --timeout="${SNAPSHOT_TIMEOUT}" \ + --output=snapshot.yaml +echo "--- Snapshot output ---" +cat snapshot.yaml diff --git a/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh new file mode 100644 index 000000000..5a27e6093 --- /dev/null +++ b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Query by subtype field (not index) — #502 added a "hardware" subtype before "smi". +GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml) +GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml) +echo "GPU model: ${GPU_MODEL}" +echo "GPU count: ${GPU_COUNT}" +if ! 
[[ "${GPU_COUNT}" =~ ^[0-9]+$ ]]; then + echo "::error::Expected numeric gpu-count in snapshot, got: ${GPU_COUNT}" + exit 1 +fi +if [[ "${GPU_MODEL}" != *"${EXPECTED_GPU_MODEL}"* ]]; then + echo "::error::Expected ${EXPECTED_GPU_MODEL} GPU in snapshot, got: ${GPU_MODEL}" + exit 1 +fi +if [[ "${GPU_COUNT}" -lt ${MIN_GPU_COUNT} ]]; then + echo "::error::Expected gpu-count >= ${MIN_GPU_COUNT}, got: ${GPU_COUNT}" + exit 1 +fi +echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}" diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml index 30ac7831f..417130669 100644 --- a/.github/actions/gpu-test-cleanup/action.yml +++ b/.github/actions/gpu-test-cleanup/action.yml @@ -23,35 +23,34 @@ inputs: description: 'Prefix for the uploaded artifact name' required: false default: 'gpu-test-debug' + collect_artifacts: + description: 'Collect and upload debug artifacts before deleting the kind cluster' + required: false + default: 'false' runs: using: 'composite' steps: - name: Collect debug artifacts - if: failure() + if: inputs.collect_artifacts == 'true' shell: bash env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} - run: | - mkdir -p /tmp/debug-artifacts - kubectl --context="kind-${KIND_CLUSTER_NAME}" get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true - kubectl 
--context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true - + run: bash "${{ github.action_path }}/collect-debug-artifacts.sh" - name: Export kind logs - if: failure() + if: always() && inputs.collect_artifacts == 'true' shell: bash env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} - run: | - mkdir -p /tmp/kind-logs - kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true - + run: bash "${{ github.action_path }}/export-kind-logs.sh" + - name: Cleanup + if: always() + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.action_path }}/cleanup-kind-cluster.sh" - name: Upload debug artifacts - if: failure() + if: always() && inputs.collect_artifacts == 'true' uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: ${{ inputs.artifact_name_prefix }}-${{ github.run_id }} @@ -59,12 +58,3 @@ runs: /tmp/debug-artifacts/ /tmp/kind-logs/ retention-days: 7 - - - name: Cleanup - if: always() - shell: bash - env: - KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} - run: | - kind delete cluster --name "${KIND_CLUSTER_NAME}" || true - docker system prune -f || true diff --git a/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh new file mode 100644 index 000000000..4603d494d --- /dev/null +++ b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true +docker_timeout() { + local limit="$1" + shift + timeout "${limit}" docker "$@" +} +kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" +remaining_containers=$(docker_timeout 30s ps -aq --filter "label=${kind_cluster_label}" || true) +if [[ -n "${remaining_containers}" ]]; then + echo "Removing leftover kind containers for ${KIND_CLUSTER_NAME}:" + docker_timeout 30s ps -a --filter "label=${kind_cluster_label}" || true + docker_timeout 30s rm -f ${remaining_containers} || true +fi +docker_timeout 60s builder prune -f --filter "until=24h" || true +docker_timeout 60s system prune -f --filter "until=24h" || true diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh new file mode 100644 index 000000000..7c780e3f7 --- /dev/null +++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Diagnostic artifact collection intentionally omits -e so one broken cluster +# call does not prevent later artifacts from being collected. +set -uo pipefail +rm -rf /tmp/debug-artifacts +mkdir -p /tmp/debug-artifacts +CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd" +MAX_KIND_NODE_ARTIFACT_SECONDS="${MAX_KIND_NODE_ARTIFACT_SECONDS:-600}" +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} +docker_timeout() { + local limit="$1" + shift + timeout "${limit}" docker "$@" +} + +{ + date -u || true + hostname || true + uptime || true + nproc || true + free -h || true + df -h / || true + df -ih / || true +} > /tmp/debug-artifacts/runner-baseline.txt 2>&1 || true +docker_timeout 30s version > /tmp/debug-artifacts/docker-version.txt 2>&1 || true +docker_timeout 30s info > /tmp/debug-artifacts/docker-info.txt 2>&1 || true +nvidia-smi -L > /tmp/debug-artifacts/host-gpus.txt 2>&1 || true +nvidia-smi >> /tmp/debug-artifacts/host-gpus.txt 2>&1 || true +kind get clusters > /tmp/debug-artifacts/kind-clusters.txt 2>&1 || true +docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + > /tmp/debug-artifacts/kind-node-containers.txt 2>&1 || true + +kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true +kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true +kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true +kubectl_kind get --raw='/readyz?verbose' > /tmp/debug-artifacts/apiserver-readyz.txt 2>&1 || true +kubectl_kind -n kube-system get pods -l tier=control-plane -o wide \ + > /tmp/debug-artifacts/control-plane-pods.txt 2>&1 || true +kubectl_kind -n kube-system get events --sort-by='.lastTimestamp' \ + > 
/tmp/debug-artifacts/kube-system-events.txt 2>&1 || true +for component in ${CONTROL_PLANE_COMPONENTS}; do + kubectl_kind -n kube-system describe pod -l "component=${component}" \ + > "/tmp/debug-artifacts/${component}-describe.txt" 2>&1 || true + kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --tail=300 \ + > "/tmp/debug-artifacts/${component}-logs.txt" 2>&1 || true + kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --previous --tail=300 \ + > "/tmp/debug-artifacts/${component}-previous-logs.txt" 2>&1 || true + kubectl_kind -n kube-system get lease "${component}" -o yaml \ + > "/tmp/debug-artifacts/${component}-lease.yaml" 2>&1 || true +done +kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true +kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true +kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true +kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide \ + > /tmp/debug-artifacts/monitoring-workloads.txt 2>&1 || true +kubectl_kind -n monitoring describe deployment kube-prometheus-operator \ + > /tmp/debug-artifacts/kube-prometheus-operator-deployment-describe.txt 2>&1 || true +kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=300 \ + > /tmp/debug-artifacts/kube-prometheus-operator-logs.txt 2>&1 || true +kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=300 \ + > /tmp/debug-artifacts/kube-prometheus-operator-previous-logs.txt 2>&1 || true +kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' \ + > /tmp/debug-artifacts/monitoring-events.txt 2>&1 || true +{ + kubectl_kind -n monitoring get pods -o name 2>/dev/null \ + | grep '^pod/kube-prometheus-operator-' \ + | 
while read -r pod; do + echo "=== ${pod} ===" + kubectl_kind -n monitoring describe "${pod}" 2>&1 || true + done +} > /tmp/debug-artifacts/kube-prometheus-operator-pods-describe.txt 2>&1 || true +kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true +tar_inputs=() +[[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml) +[[ -d bundle ]] && tar_inputs+=(bundle) +if [[ "${#tar_inputs[@]}" -gt 0 ]]; then + echo "Archiving runtime bundle inputs: ${tar_inputs[*]}" + tar -czf /tmp/debug-artifacts/aicr-runtime-bundle.tar.gz "${tar_inputs[@]}" || true +else + echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive" +fi + +artifact_loop_start="$(date +%s)" +docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + --format '{{.Names}}' | sort | while read -r node_container; do + [[ -z "${node_container}" ]] && continue + artifact_loop_elapsed=$(($(date +%s) - artifact_loop_start)) + if (( artifact_loop_elapsed > MAX_KIND_NODE_ARTIFACT_SECONDS )); then + echo "Kind node artifact collection exceeded ${MAX_KIND_NODE_ARTIFACT_SECONDS}s; stopping after partial collection." 
+ break + fi + node_file="${node_container//[^A-Za-z0-9_.-]/_}" + docker_timeout 30s inspect "${node_container}" \ + > "/tmp/debug-artifacts/${node_file}-docker-inspect.json" 2>&1 || true + docker_timeout 30s exec "${node_container}" journalctl -u kubelet \ + --since "90 minutes ago" --no-pager \ + > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" journalctl -u containerd \ + --since "90 minutes ago" --no-pager \ + > "/tmp/debug-artifacts/${node_file}-containerd-journal.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" crictl ps -a \ + > "/tmp/debug-artifacts/${node_file}-crictl-ps-a.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" crictl pods \ + > "/tmp/debug-artifacts/${node_file}-crictl-pods.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" crictl stats \ + > "/tmp/debug-artifacts/${node_file}-crictl-stats.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" sh -c ' + date + uptime || true + free -h || true + df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h + echo "--- top cpu/memory processes ---" + ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true + ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true + # shellcheck disable=SC2016 # Expanded inside the kind node shell. + docker_timeout 120s exec "${node_container}" sh -c ' + for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do + echo "=== ${component} static pod manifest ===" + sed -n "1,220p" "/etc/kubernetes/manifests/${component}.yaml" 2>/dev/null || true + echo "=== ${component} CRI containers ===" + crictl ps -a --name "${component}" || true + count=0 + for container_id in $(crictl ps -a --name "${component}" -q 2>/dev/null); do + count=$((count + 1)) + if [ "${count}" -gt 8 ]; then + echo "Skipping remaining ${component} CRI containers after first 8 entries." 
+ break + fi + echo "=== crictl inspect ${component} ${container_id} ===" + crictl inspect "${container_id}" || true + echo "=== crictl logs ${component} ${container_id} ===" + crictl logs --tail=300 "${container_id}" || true + done + done + ' > "/tmp/debug-artifacts/${node_file}-control-plane-cri.txt" 2>&1 || true + done || true diff --git a/.github/actions/gpu-test-cleanup/export-kind-logs.sh b/.github/actions/gpu-test-cleanup/export-kind-logs.sh new file mode 100644 index 000000000..2522481eb --- /dev/null +++ b/.github/actions/gpu-test-cleanup/export-kind-logs.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+set -euo pipefail
+
+mkdir -p /tmp/kind-logs # destination directory for the exported kind logs
+timeout 300s kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true # best effort: capped at 300s, and a failed or timed-out export is ignored
diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml index fde7bddde..f26aa38a5 100644 --- a/.github/actions/install-karpenter-kwok/action.yml +++ b/.github/actions/install-karpenter-kwok/action.yml @@ -19,6 +19,18 @@ inputs: cluster_name: description: 'Kind cluster name (used for kubectl context)' required: true + kwok_helm_timeout: + description: 'Timeout for KWOK controller Helm install' + required: false + default: '300s' + ko_build_timeout: + description: 'Timeout in seconds for Karpenter KWOK provider ko build' + required: false + default: '900' + karpenter_helm_timeout: + description: 'Timeout for Karpenter Helm install' + required: false + default: '300s' runs: using: 'composite' @@ -26,9 +38,12 @@ runs: - name: Resolve versions id: versions shell: bash - run: | - echo "karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" >> "$GITHUB_OUTPUT" - echo "go=$(go env GOVERSION)" >> "$GITHUB_OUTPUT" + run: bash "${{ github.action_path }}/resolve-versions.sh" + - name: Install ko + uses: ./.github/actions/setup-build-tools + with: + install_ko: 'true' + ko_version: ${{ steps.versions.outputs.ko }} - name: Cache Karpenter Go build cache uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 @@ -46,7 +61,7 @@ runs: env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} KARPENTER_VERSION: ${{ steps.versions.outputs.karpenter }} - run: | - set -euo pipefail - bash kwok/scripts/install-karpenter-kwok.sh - kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml + KWOK_HELM_TIMEOUT: ${{ inputs.kwok_helm_timeout }} + KO_BUILD_TIMEOUT: ${{ inputs.ko_build_timeout }} + KARPENTER_HELM_TIMEOUT: ${{ inputs.karpenter_helm_timeout }} + run: bash "${{ github.action_path }}/install-karpenter-kwok.sh" diff --git 
a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
new file mode 100644
index 000000000..8987144ab
--- /dev/null
+++ b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+validate_duration_input() { # $1 = input name used in messages, $2 = value; exits 1 unless the value is digits followed by s, m, or h
+  local input_name="$1"
+  local input_value="$2"
+
+  if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then
+    echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'"
+    exit 1
+  fi
+}
+
+validate_seconds_input() { # $1 = input name used in messages, $2 = value; exits 1 unless the value is a positive integer
+  local input_name="$1"
+  local input_value="$2"
+
+  if ! [[ "${input_value}" =~ ^[0-9]+$ ]]; then
+    echo "::error::${input_name} must be an integer number of seconds, got '${input_value}'"
+    exit 1
+  fi
+  if (( 10#${input_value} <= 0 )); then # 10# forces base 10 so leading zeros are not parsed as octal
+    echo "::error::${input_name} must be greater than 0 seconds, got '${input_value}'"
+    exit 1
+  fi
+}
+
+validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT:-}" # ":-" lets an unset variable reach the validator and fail with its ::error:: message instead of tripping set -u
+validate_seconds_input ko_build_timeout "${KO_BUILD_TIMEOUT:-}"
+validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT:-}"
+bash kwok/scripts/install-karpenter-kwok.sh
+timeout 30s kubectl --request-timeout=10s \
+  --context="kind-${KIND_CLUSTER_NAME}" \
+  apply -f kwok/manifests/karpenter/nodepool.yaml
diff --git a/.github/actions/install-karpenter-kwok/resolve-versions.sh b/.github/actions/install-karpenter-kwok/resolve-versions.sh
new file mode 100644
index 000000000..84e85458e
--- /dev/null
+++ b/.github/actions/install-karpenter-kwok/resolve-versions.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+karpenter_version="$(yq eval -e '.testing_tools.karpenter' .settings.yaml)" # plain assignment so set -e sees a failed lookup; -e makes yq fail on a missing or null key
+ko_version="$(yq eval -e '.build_tools.ko' .settings.yaml)"
+go_version="$(go env GOVERSION)"
+printf 'karpenter=%s\nko=%s\ngo=%s\n' "${karpenter_version}" "${ko_version}" "${go_version}" >> "$GITHUB_OUTPUT" # outputs are written only after every lookup succeeded
diff --git a/.github/actions/load-versions/action.yml b/.github/actions/load-versions/action.yml index b87e321d1..b3c506d40 100644 --- a/.github/actions/load-versions/action.yml +++ b/.github/actions/load-versions/action.yml @@ -40,6 +40,9 @@ outputs: kind: description: 'Kind version' value: ${{ steps.versions.outputs.kind }} + nvkind: + description: 'nvkind git ref' + value: ${{ steps.versions.outputs.nvkind }} ctlptl: description: 'ctlptl version' value: ${{ steps.versions.outputs.ctlptl }} @@ -91,6 +94,9 @@ outputs: kind_node_image: description: 'Kind node image for testing' value: ${{ steps.versions.outputs.kind_node_image }} + h100_kind_node_image: + description: 'Kind node image for H100 GPU tests' + value: ${{ steps.versions.outputs.h100_kind_node_image }} runs: using: 'composite' @@ -121,6 +127,7 @@ runs: # Testing tools echo "kubectl=$(yq eval '.testing_tools.kubectl' .settings.yaml)" >> $GITHUB_OUTPUT echo "kind=$(yq eval '.testing_tools.kind' .settings.yaml)" >> $GITHUB_OUTPUT + echo "nvkind=$(yq eval '.testing_tools.nvkind' .settings.yaml)" >> $GITHUB_OUTPUT echo "ctlptl=$(yq eval '.testing_tools.ctlptl' .settings.yaml)" >> $GITHUB_OUTPUT echo "tilt=$(yq eval '.testing_tools.tilt' .settings.yaml)" >> $GITHUB_OUTPUT echo "helm=$(yq eval '.testing_tools.helm' .settings.yaml)" >> $GITHUB_OUTPUT @@ -141,6 +148,7 @@ runs: # Testing configuration echo "kind_node_image=$(yq eval '.testing.kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT + echo "h100_kind_node_image=$(yq eval '.testing.h100_kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT - name: Display loaded versions shell: bash @@ -158,6 +166,7 @@ runs: echo " grype: ${{ steps.versions.outputs.grype }}" echo " kubectl: ${{ steps.versions.outputs.kubectl }}" echo " 
kind: ${{ steps.versions.outputs.kind }}" + echo " nvkind: ${{ steps.versions.outputs.nvkind }}" echo " ctlptl: ${{ steps.versions.outputs.ctlptl }}" echo " tilt: ${{ steps.versions.outputs.tilt }}" echo " helm: ${{ steps.versions.outputs.helm }}" @@ -172,3 +181,4 @@ runs: echo " lint_timeout: ${{ steps.versions.outputs.lint_timeout }}" echo " test_timeout: ${{ steps.versions.outputs.test_timeout }}" echo " kind_node_image: ${{ steps.versions.outputs.kind_node_image }}" + echo " h100_kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}"
diff --git a/.github/scripts/gpu-chainsaw-health.sh b/.github/scripts/gpu-chainsaw-health.sh
new file mode 100644
index 000000000..7098c6bf8
--- /dev/null
+++ b/.github/scripts/gpu-chainsaw-health.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+  echo "::error::Usage: $0 <test-dir>"
+  exit 2
+fi
+test_dir="$1"
+if [[ ! -d "${test_dir}" ]]; then
+  echo "::error::Test directory not found: ${test_dir}"
+  exit 1
+fi
+
+CHAINSAW_TEST_TIMEOUT="${CHAINSAW_TEST_TIMEOUT:-30m}" # overridable; bounds the whole chainsaw run below
+MONITORING_READY_TIMEOUT="${MONITORING_READY_TIMEOUT:-180s}" # overridable; passed to rollout status --timeout
+
+kubectl_kind() { # kubectl against the kind-${KIND_CLUSTER_NAME} context, bounded by a 10s request timeout and a 30s process timeout
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+kubectl_kind_wait() {
+  # Rollout status opens a watch that is already bounded by --timeout. Keep
+  # request-timeout unset here so a slow API server does not cut the watch short.
+  kubectl --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+print_monitoring_diagnostics() { # best-effort dump of monitoring namespace state; every command tolerates failure
+  echo "=== Monitoring workloads ==="
+  kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment ==="
+  kubectl_kind -n monitoring get deployment kube-prometheus-operator -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment describe ==="
+  kubectl_kind -n monitoring describe deployment kube-prometheus-operator 2>/dev/null || true
+  echo "=== kube-prometheus-operator pods ==="
+  kubectl_kind -n monitoring get pods -o wide 2>/dev/null \
+    | grep -E '(^NAME|^kube-prometheus-operator-)' || true
+  echo "=== kube-prometheus-operator logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=200 2>/dev/null || true
+  echo "=== kube-prometheus-operator previous logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=200 2>/dev/null || true
+  echo "=== Recent events (monitoring) ==="
+  kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' 2>/dev/null | tail -100 || true
+}
+
+wait_for_monitoring_operator() { # returns 0 once the deployment rollout completes, 1 (after printing diagnostics) otherwise
+  echo "Waiting for monitoring/kube-prometheus-operator before Chainsaw..."
+  print_monitoring_diagnostics # snapshot of the state before the wait starts
+  if kubectl_kind_wait -n monitoring rollout status deployment/kube-prometheus-operator \
+    --timeout="${MONITORING_READY_TIMEOUT}"; then
+    echo "monitoring/kube-prometheus-operator is rolled out."
+    return 0
+  fi
+
+  echo "::error::monitoring/kube-prometheus-operator did not become available within ${MONITORING_READY_TIMEOUT}"
+  print_monitoring_diagnostics
+  return 1
+}
+
+wait_for_monitoring_operator
+
+timeout "${CHAINSAW_TEST_TIMEOUT}" chainsaw test \
+  --test-dir "${test_dir}" \
+  --config tests/chainsaw/chainsaw-config.yaml \
+  --skip-delete
diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh
new file mode 100644
index 000000000..3db82a6e8
--- /dev/null
+++ b/.github/scripts/gpu-debug-diagnostics.sh
@@ -0,0 +1,256 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Diagnostic script: intentionally omits -e so each mode can keep collecting
+# partial failure data. Keep -u and pipefail to catch script bugs and pipeline
+# failures while individual kubectl_kind calls tolerate cluster errors.
+set -uo pipefail
+
+mode="${GPU_TEST_DIAGNOSTIC_MODE:-smoke}" # one of smoke, training, inference; see the case statement at the end
+
+kubectl_kind() { # kubectl against the kind-${KIND_CLUSTER_NAME} context, bounded by a 10s request timeout and a 30s process timeout
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+docker_timeout() { # $1 = time limit for timeout(1); remaining arguments are passed to docker
+  local limit="$1"
+  shift
+  timeout "${limit}" docker "$@"
+}
+
+print_setup_diagnostics() { # runner baseline, docker/GPU/kind state, and per-node container resource limits
+  echo "=== Runner baseline ==="
+  date -u || true
+  hostname || true
+  uptime || true
+  nproc || true
+  free -h || true
+  df -h / || true
+  df -ih / || true
+  echo "=== Docker health ==="
+  docker_timeout 30s info >/dev/null 2>&1 && docker_timeout 30s version || true # bounded so a hung daemon cannot stall collection
+  echo "=== Host GPUs ==="
+  timeout 30s nvidia-smi -L || true # bounded like the other probes in this script
+  timeout 30s nvidia-smi || true
+  echo "=== Kind clusters ==="
+  timeout 30s kind get clusters || true
+  echo "=== Kind node containers ==="
+  docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
+  echo "=== Kind node container resources ==="
+  docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+    --format '{{.Names}}' | sort | while read -r node_container; do
+    [[ -z "${node_container}" ]] && continue
+    docker_timeout 30s inspect "${node_container}" \
+      --format '{{.Name}} State={{.State.Status}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' || true
+  done || true
+}
+
+print_workload_images() { # $1 = namespace; prints kind, namespace/name, and the unique container images as TSV
+  local ns="$1"
+  kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \
+    | jq -r '
+      .items[] |
+      [
+        .kind,
+        .metadata.namespace + "/" + .metadata.name,
+        (([.spec.template.spec.containers[]?.image] +
+          [.spec.template.spec.initContainers[]?.image]) | unique | join(","))
+      ] | @tsv
+    ' || true
+}
+
+print_workload_inventory() { # arguments = namespaces; prints the image inventory of each in turn
+  local ns
+  echo "=== Workload image inventory ==="
+  for ns in "$@"; do
+    echo "--- ${ns} ---"
+    print_workload_images "${ns}"
+  done
+}
+
+print_component_status_summary() { # cluster-wide workload status, rollout counters, and pods not Running/Succeeded
+  echo "=== Component workload status ==="
+  kubectl_kind get deployments,statefulsets,daemonsets,pods -A -o wide 2>/dev/null || true
+  echo "=== Component rollout conditions ==="
+  kubectl_kind get deployments,statefulsets,daemonsets -A \
+    -o custom-columns='KIND:.kind,NAMESPACE:.metadata.namespace,NAME:.metadata.name,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas,DESIRED:.status.replicas,UPDATED:.status.updatedReplicas,AGE:.metadata.creationTimestamp' \
+    2>/dev/null || true
+  echo "=== Non-ready pods ==="
+  kubectl_kind get pods -A \
+    --field-selector=status.phase!=Running,status.phase!=Succeeded \
+    -o wide 2>/dev/null || true
+}
+
+print_kube_prometheus_operator_diagnostics() { # monitoring namespace state plus kube-prometheus-operator describe/logs/events
+  echo "=== Monitoring workloads ==="
+  kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment ==="
+  kubectl_kind -n monitoring get deployment kube-prometheus-operator -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment describe ==="
+  kubectl_kind -n monitoring describe deployment kube-prometheus-operator 2>/dev/null || true
+  echo "=== kube-prometheus-operator pod describe ==="
+  kubectl_kind -n monitoring get pods -o name 2>/dev/null \
+    | grep '^pod/kube-prometheus-operator-' \
+    | while read -r pod; do
+      echo "--- ${pod} ---"
+      kubectl_kind -n monitoring describe "${pod}" 2>/dev/null || true
+    done || true
+  echo "=== kube-prometheus-operator logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=200 2>/dev/null || true
+  echo "=== kube-prometheus-operator previous logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=200 2>/dev/null || true
+  echo "=== Recent events (monitoring) ==="
+  kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' 2>/dev/null | tail -80 || true
+}
+
+print_kai_diagnostics() { # kai-scheduler namespace pods, admission deployment, logs, queues, podgroups, and events
+  echo "=== KAI scheduler pods ==="
+  kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true
+  echo "=== KAI admission deployment ==="
+  kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true
+  echo "=== KAI admission deployment describe ==="
+  kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true
+  echo "=== KAI admission pod describe ==="
+  kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \
+    | grep '^pod/admission-' \
+    | while read -r pod; do
+      kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true
+    done || true
+  echo "=== KAI admission logs ==="
+  kubectl_kind -n kai-scheduler logs deployment/admission --all-containers --tail=200 2>/dev/null || true
+  echo "=== KAI scheduler logs ==="
+  kubectl_kind -n kai-scheduler logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
+  echo "=== KAI scheduler queues ==="
+  kubectl_kind get queues -A 2>/dev/null || true
+  echo "=== KAI scheduler podgroups ==="
+  kubectl_kind get podgroups -A 2>/dev/null || true
+  echo "=== Recent events (kai-scheduler) ==="
+  kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
+}
+
+print_custom_metrics() { # arguments = namespaces; queries three GPU metrics per namespace from the custom metrics API
+  local metric
+  local ns
+  local namespaces=("$@")
+
+  echo "=== Custom metrics API ==="
+  for metric in gpu_utilization gpu_memory_used gpu_power_usage; do
+    for ns in "${namespaces[@]}"; do
+      echo "--- ${ns}/${metric} ---"
+      kubectl_kind get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${ns}/pods/*/${metric}" 2>/dev/null \
+        | jq . || true
+    done
+  done
+}
+
+print_metrics_pipeline_diagnostics() { # prometheus-adapter, DCGM exporter, monitoring pods, ResourceSlices, and nodes
+  echo "=== prometheus-adapter pods ==="
+  kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
+  echo "=== DCGM Exporter pods ==="
+  kubectl_kind -n gpu-operator get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
+  echo "=== Monitoring pods ==="
+  kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true
+  echo "=== DRA ResourceSlices ==="
+  kubectl_kind get resourceslices -o wide 2>/dev/null || true
+  echo "=== Node status ==="
+  kubectl_kind get nodes -o wide 2>/dev/null || true
+}
+
+print_common_gpu_diagnostics() { # ClusterPolicy, gpu-operator pods and events, and pods not Running/Succeeded
+  echo "=== ClusterPolicy status ==="
+  kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true
+  echo "=== GPU Operator pods ==="
+  kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true
+  echo "=== Non-running pods (all namespaces) ==="
+  kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
+  echo "=== Recent events (gpu-operator) ==="
+  kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+}
+
+print_h100_common_diagnostics() { # arguments = extra namespaces added to the image inventory and custom-metrics queries
+  local metric_namespaces=("$@")
+  local common_namespaces=(
+    cert-manager
+    gpu-operator
+    monitoring
+    skyhook
+    nvsentinel
+    nvidia-dra-driver
+    nvidia-network-operator
+    kai-scheduler
+  )
+
+  print_setup_diagnostics
+  print_component_status_summary
+  print_workload_inventory "${common_namespaces[@]}" "${metric_namespaces[@]}"
+  print_common_gpu_diagnostics
+  print_kube_prometheus_operator_diagnostics
+  print_kai_diagnostics
+  print_custom_metrics gpu-operator "${metric_namespaces[@]}"
+  print_metrics_pipeline_diagnostics
+  echo "=== Node resources ==="
+  kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true
+}
+
+print_kubeflow_diagnostics() { # Kubeflow Trainer deployment, pods, validating webhook, and TrainJob CRD
+  echo "=== Kubeflow Trainer deployment ==="
+  kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
+  echo "=== Kubeflow pods ==="
+  kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true
+  echo "=== Kubeflow validating webhooks ==="
+  kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
+  echo "=== Kubeflow Trainer CRD ==="
+  kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
+}
+
+print_dynamo_diagnostics() { # dynamo-system pods, operator manager logs, and recent events
+  echo "=== Dynamo pods ==="
+  kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true
+  echo "=== Dynamo operator logs ==="
+  kubectl_kind -n dynamo-system logs deployment/dynamo-platform-dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
+  echo "=== Recent events (dynamo-system) ==="
+  kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+}
+
+print_kgateway_diagnostics() { # kgateway-system pods plus GatewayClass and Gateway objects
+  echo "=== kgateway pods ==="
+  kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true
+  echo "=== GatewayClass status ==="
+  kubectl_kind get gatewayclass -o yaml 2>/dev/null || true
+  echo "=== Gateway status ==="
+  kubectl_kind get gateways -A -o yaml 2>/dev/null || true
+}
+
+case "${mode}" in
+  smoke)
+    print_setup_diagnostics
+    print_common_gpu_diagnostics
+    echo "=== Node status ==="
+    kubectl_kind get nodes -o wide 2>/dev/null || true
+    ;;
+  training)
+    print_h100_common_diagnostics kubeflow
+    print_kubeflow_diagnostics
+    ;;
+  inference)
+    print_h100_common_diagnostics dynamo-system kgateway-system
+    print_dynamo_diagnostics
+    print_kgateway_diagnostics
+    ;;
+  *)
+    echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}"
+    exit 1
+    ;;
+esac
diff --git a/.github/scripts/gpu-runtime-component-health.sh b/.github/scripts/gpu-runtime-component-health.sh
new file mode 100644
index 000000000..3d668d37b
--- /dev/null
+++ b/.github/scripts/gpu-runtime-component-health.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, 
NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+  echo "::error::Usage: $0 <training|inference>"
+  exit 2
+fi
+
+mode="$1"
+COMPONENT_HEALTH_TIMEOUT="${COMPONENT_HEALTH_TIMEOUT:-120s}" # overridable; used for kubectl wait and the object polling loop
+
+duration_seconds() { # $1 = duration ending in s, m, or h; prints the equivalent number of seconds
+  local input_value="$1"
+  local number="${input_value%[smh]}"
+  local unit="${input_value: -1}"
+
+  case "${unit}" in
+    s) echo "$((10#${number}))" ;;
+    m) echo "$((10#${number} * 60))" ;;
+    h) echo "$((10#${number} * 3600))" ;;
+    *)
+      echo "::error::unsupported duration '${input_value}'" >&2
+      exit 1
+      ;;
+  esac
+}
+
+kubectl_kind() { # kubectl against the kind-${KIND_CLUSTER_NAME} context, bounded by a 10s request timeout and a 30s process timeout
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+kubectl_kind_wait() {
+  # kubectl wait opens a watch that is already bounded by --timeout. Keep
+  # request-timeout unset here so a slow API server does not cut the watch short.
+  kubectl --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+print_namespace_diagnostics() { # $1 = namespace; best-effort listing of its workloads and recent events
+  local ns="$1"
+
+  echo "=== ${ns} workloads ==="
+  kubectl_kind -n "${ns}" get deployments,statefulsets,daemonsets,pods -o wide 2>/dev/null || true
+  echo "=== Recent events (${ns}) ==="
+  kubectl_kind -n "${ns}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -80 || true
+}
+
+wait_for_deployments() { # $1 = namespace, rest = deployment/<name> resources; returns 1 after diagnostics if any is not Available
+  local ns="$1"
+  shift
+  local deployments=("$@")
+
+  echo "Waiting up to ${COMPONENT_HEALTH_TIMEOUT} for ${ns}: ${deployments[*]}"
+  if kubectl_kind_wait -n "${ns}" wait \
+    --for=condition=Available \
+    --timeout="${COMPONENT_HEALTH_TIMEOUT}" \
+    "${deployments[@]}"; then
+    return 0
+  fi
+
+  echo "::error::One or more deployments in ${ns} did not become Available within ${COMPONENT_HEALTH_TIMEOUT}: ${deployments[*]}"
+  print_namespace_diagnostics "${ns}"
+  return 1
+}
+
+wait_for_required_object() { # $1 = resource as type/name; polls every 2s until it exists or the timeout elapses
+  local resource="$1"
+  local timeout_seconds
+  local deadline
+
+  timeout_seconds="$(duration_seconds "${COMPONENT_HEALTH_TIMEOUT}")"
+  deadline=$((SECONDS + timeout_seconds))
+
+  echo "Waiting up to ${COMPONENT_HEALTH_TIMEOUT} for ${resource}"
+  while (( SECONDS <= deadline )); do
+    if kubectl_kind get "${resource}" >/dev/null; then
+      return 0
+    fi
+    sleep 2
+  done
+
+  echo "::error::Required object is missing: ${resource}"
+  kubectl_kind get "${resource}" -o yaml 2>/dev/null || true
+  kubectl_kind describe "${resource}" 2>/dev/null || true
+  return 1
+}
+
+echo "=== Runtime component health (${mode}) ==="
+
+wait_for_deployments monitoring \
+  deployment/kube-prometheus-operator
+
+wait_for_deployments kai-scheduler \
+  deployment/kai-scheduler-default \
+  deployment/admission \
+  deployment/binder \
+  deployment/kai-operator \
+  deployment/pod-grouper \
+  deployment/podgroup-controller \
+  deployment/queue-controller
+
+case "${mode}" in
+  training)
+    wait_for_deployments kubeflow \
+      deployment/kubeflow-trainer-controller-manager
+    wait_for_required_object validatingwebhookconfiguration/validator.trainer.kubeflow.org
+    wait_for_required_object customresourcedefinition/trainjobs.trainer.kubeflow.org
+    ;;
+  inference)
+    wait_for_deployments dynamo-system \
+      deployment/dynamo-platform-dynamo-operator-controller-manager \
+      deployment/grove-operator
+    wait_for_deployments kgateway-system \
+      deployment/kgateway \
+      deployment/inference-gateway
+    ;;
+  *)
+    echo "::error::unknown runtime component health mode: ${mode}"
+    exit 2
+    ;;
+esac
+
+echo "Runtime component health check passed."
diff --git a/.github/scripts/gpu-smoke-run-nvidia-smi.sh b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
new file mode 100644
index 000000000..0d4ea31d7
--- /dev/null
+++ b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}" # overridable; applied to ordinary kubectl calls
+KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-130s}" # overridable; applied to kubectl wait calls
+POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}" # overridable; receives the generated pod name
+
+kubectl_kind() { # kubectl against the kind-${KIND_CLUSTER_NAME} context with the short request timeout
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+kubectl_kind_wait() { # kubectl with the longer wait request timeout, additionally capped at 150s by timeout(1)
+  timeout 150s kubectl --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+pod_name=$(cat <<'EOF' | kubectl_kind create -f - -o jsonpath='{.metadata.name}'
+apiVersion: v1
+kind: Pod
+metadata:
+  generateName: gpu-smoke-test-
+  labels:
+    app: gpu-smoke-test
+spec:
+  restartPolicy: Never
+  containers:
+    - name: nvidia-smi
+      image: ubuntu:22.04
+      command: ["nvidia-smi"]
+      resources:
+        limits:
+          nvidia.com/gpu: 1
+EOF
+)
+
+echo "${pod_name}" > "${POD_NAME_FILE}" # persist the server-generated name captured from the create call
+
+echo "Waiting for ${pod_name} pod to complete..."
+kubectl_kind_wait wait "pod/${pod_name}" \
+  --for=condition=Ready --timeout=120s || true # a failed Ready wait is tolerated; the phase wait below decides the exit status
+kubectl_kind_wait wait "pod/${pod_name}" \
+  --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s
diff --git a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
new file mode 100644
index 000000000..05bc09523
--- /dev/null
+++ b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}" # overridable; applied to every kubectl call below
+POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}" # overridable; file expected to hold the smoke-test pod name
+trap 'rm -f "${POD_NAME_FILE}"' EXIT # the name file is removed whenever this script exits
+
+kubectl_kind() { # kubectl against the kind-${KIND_CLUSTER_NAME} context with the request timeout applied
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+pod_name=""
+if [[ -f "${POD_NAME_FILE}" ]]; then
+  pod_name="$(cat "${POD_NAME_FILE}")"
+  if [[ -n "${pod_name}" ]] && ! kubectl_kind get pod "${pod_name}" >/dev/null 2>&1; then # recorded pod cannot be fetched: discard the name
+    pod_name=""
+  fi
+fi
+
+if [[ -z "${pod_name}" ]]; then # fallback: last app=gpu-smoke-test pod when sorted by creation timestamp
+  pod_name=$(kubectl_kind get pods \
+    -l app=gpu-smoke-test \
+    --sort-by=.metadata.creationTimestamp \
+    -o jsonpath='{.items[-1:].metadata.name}')
+fi
+
+if [[ -z "${pod_name}" ]]; then
+  echo "::error::no gpu-smoke-test pod found"
+  exit 1
+fi
+
+kubectl_kind logs "${pod_name}"
diff --git a/.github/scripts/gpu-validate-conformance.sh b/.github/scripts/gpu-validate-conformance.sh
new file mode 100644
index 000000000..79550cb3a
--- /dev/null
+++ b/.github/scripts/gpu-validate-conformance.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +set -euo pipefail + +AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ +./aicr validate \ + --recipe recipe.yaml \ + --phase conformance \ + --namespace gpu-operator \ + --kubeconfig="${HOME}/.kube/config" \ + --require-gpu \ + --image=ko.local:smoke-test \ + --timeout=10m \ + --toleration '*' \ + --output=validation-result.yaml \ + --evidence-dir=conformance-evidence diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index c5e1882d4..4f06bb396 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -16,12 +16,10 @@ name: GPU Inference Test (nvkind + H100 x2) on: schedule: - - cron: '15 6,18 * * *' # Every 12 hours (2x daily), offset from T4 smoke test + - cron: '15 6,18 * * *' # Every 12 hours (2x daily), offset from training test push: branches: - "pull-request/[0-9]+" - pull_request: - types: [labeled] workflow_dispatch: {} # Allow manual runs permissions: @@ -40,6 +38,8 @@ jobs: should-run: ${{ steps.filter.outputs.matched }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 id: filter with: @@ -50,13 +50,21 @@ jobs: - '.settings.yaml' - '.github/actions/gpu-cluster-setup/**' - '.github/actions/gpu-operator-install/**' + - '.github/actions/check-control-plane-health/**' - '.github/actions/aicr-build/**' - '.github/actions/setup-build-tools/**' - '.github/actions/install-karpenter-kwok/**' - 'validators/*/Dockerfile' - 'pkg/evidence/**' + - '.github/workflows/gpu-h100-kind-runtime-test.yaml' + - '.github/actions/gpu-debug-diagnostics/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' + - '.github/scripts/gpu-chainsaw-health.sh' + - '.github/scripts/gpu-debug-diagnostics.sh' + - '.github/scripts/gpu-runtime-component-health.sh' + - '.github/scripts/gpu-validate-conformance.sh' + - 
'pkg/bundler/deployer/helm/**' - 'tests/chainsaw/chainsaw-config.yaml' - 'tests/chainsaw/ai-conformance/main.go' - 'tests/chainsaw/ai-conformance/common/**' @@ -82,203 +90,23 @@ jobs: - 'pkg/defaults/timeouts.go' - 'validators/conformance/**' + # NVIDIA self-hosted GPU runners reject pull_request event jobs before + # checkout. PR GPU coverage runs through the pull-request/ push + # mirror after ok-to-test approval. gpu-inference-test: needs: [check-paths] if: > always() && ( github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') || (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) name: GPU Inference Test (nvkind + H100 x2) - concurrency: - group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} - cancel-in-progress: true - runs-on: linux-amd64-gpu-h100-latest-2 - timeout-minutes: 120 - - env: - KIND_CLUSTER_NAME: gpu-inference-test - - steps: - - - name: Checkout Code - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Set up GPU cluster - uses: ./.github/actions/gpu-cluster-setup - - - name: Build aicr - uses: ./.github/actions/aicr-build - with: - validator_phases: 'conformance' - - - name: Install runtime bundle - id: bundle-install - uses: ./.github/actions/gpu-operator-install - with: - method: bundle - accelerator: h100 - platform: dynamo - - # --- Snapshot and GPU validation --- - - - name: Snapshot and validate GPU - uses: ./.github/actions/gpu-snapshot-validate - with: - gpu_model: H100 - min_gpu_count: '2' - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - - # --- Install Karpenter + KWOK early to give monitoring stack settle time --- - - - name: Install Karpenter + KWOK - uses: ./.github/actions/install-karpenter-kwok - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - - # --- Health checks --- - - - name: Prepare chainsaw - 
id: versions - uses: ./.github/actions/load-versions - - - name: Install chainsaw - uses: ./.github/actions/setup-build-tools - with: - install_chainsaw: 'true' - chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' - - - name: Run chainsaw health checks - run: | - chainsaw test \ - --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --config tests/chainsaw/chainsaw-config.yaml - - # --- CNCF AI Conformance validation --- - # Runs after the stack health checks so gateway and metrics validators - # see a settled inference stack. - - - name: Verify expected resources exist - run: | - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug - - - name: Validate CNCF AI Conformance - id: validate-conformance - run: | - AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ - ./aicr validate \ - --recipe recipe.yaml \ - --phase conformance \ - --namespace gpu-operator \ - --kubeconfig="${HOME}/.kube/config" \ - --require-gpu \ - --image=ko.local:smoke-test \ - --timeout=10m \ - --toleration '*' \ - --output=validation-result.yaml \ - --evidence-dir=conformance-evidence - - # Dynamo smoke is intentionally disabled for now. The vLLM runtime image - # adds significant latency and flakiness in Kind CI, and training has no - # matching smoke path yet. Reintroduce it later alongside a symmetric - # training smoke test if needed. - # --- Validation artifacts --- - - # Collect a post-run resource snapshot regardless of whether conformance - # validation ran, so triage always has a cluster-state artifact. 
- - name: Collect validation artifacts - if: >- - always() - && !cancelled() - && steps.bundle-install.outcome == 'success' - continue-on-error: true - shell: bash - run: | - set -o pipefail - mkdir -p conformance-evidence - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug | tee conformance-evidence/resource-existence-post.txt - - - name: Upload validation artifacts - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: conformance-evidence - path: | - conformance-evidence/ - validation-result.yaml - if-no-files-found: warn - - - name: Debug diagnostics - if: failure() - run: | - echo "=== ClusterPolicy status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true - echo "=== GPU Operator pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true - echo "=== Non-running pods (all namespaces) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true - echo "=== Recent events (gpu-operator) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - echo "=== Dynamo pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true - echo "=== Dynamo operator logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true - echo "=== Recent events (dynamo-system) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true 
- echo "=== Custom metrics API ===" - for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do - echo "--- ${METRIC} ---" - for NS in gpu-operator dynamo-system; do - kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \ - "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true - done - done - echo "=== Grafana deployment ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true - echo "=== Grafana pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \ - -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true - echo "=== Grafana deployment describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true - echo "=== Grafana pod describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \ - -l app.kubernetes.io/name=grafana 2>/dev/null || true - echo "=== prometheus-adapter pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true - echo "=== kgateway pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kgateway-system get pods -o wide 2>/dev/null || true - echo "=== GatewayClass status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true - echo "=== Gateway status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get gateways -A -o yaml 2>/dev/null || true - echo "=== DCGM Exporter pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true - echo "=== Monitoring pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -o wide 2>/dev/null || true - echo "=== DRA ResourceSlices ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get resourceslices -o wide 2>/dev/null || true - 
echo "=== Node status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true - - - name: GPU Test Cleanup - if: always() - uses: ./.github/actions/gpu-test-cleanup - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - artifact_name_prefix: gpu-inference-test-debug + uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml + with: + job_name: GPU Inference Test (nvkind + H100 x2) + cluster_name: gpu-inference-test + intent: inference + platform: dynamo + chainsaw_path: tests/chainsaw/ai-conformance/kind-inference-dynamo + artifact_name_prefix: gpu-inference-test-debug diff --git a/.github/workflows/gpu-h100-kind-runtime-test.yaml b/.github/workflows/gpu-h100-kind-runtime-test.yaml new file mode 100644 index 000000000..6d0f8757b --- /dev/null +++ b/.github/workflows/gpu-h100-kind-runtime-test.yaml @@ -0,0 +1,221 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: GPU H100 Kind Runtime Test + +on: + workflow_call: + inputs: + job_name: + description: 'Display name for the H100 runtime job' + required: true + type: string + cluster_name: + description: 'Kind cluster name' + required: true + type: string + intent: + description: 'Runtime intent passed to the bundle installer' + required: true + type: string + platform: + description: 'Runtime platform passed to the bundle installer' + required: true + type: string + chainsaw_path: + description: 'Chainsaw health-check directory' + required: true + type: string + artifact_name_prefix: + description: 'Prefix for uploaded debug artifacts' + required: true + type: string + +permissions: + contents: read + +jobs: + gpu-h100-kind-runtime-test: + name: ${{ inputs.job_name }} + runs-on: linux-amd64-gpu-h100-latest-2 + # Cold self-hosted H100 runners can spend most of this budget pulling + # images and loading Kind nodes before validation starts. + timeout-minutes: 180 + concurrency: + group: gpu-h100-${{ inputs.cluster_name }}-${{ github.event_name }}-${{ github.ref }} + cancel-in-progress: true + + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + + steps: + - name: Checkout Code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Load GPU test versions + id: versions + uses: ./.github/actions/load-versions + + - name: Set up GPU cluster + uses: ./.github/actions/gpu-cluster-setup + with: + kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }} + min_gpu_count: '2' + gpu_model_pattern: H100 + min_free_disk_gb: '50' + min_available_memory_gb: '16' + cluster_create_timeout: 900s + control_plane_resource_patches: 'true' + control_plane_leader_election_tuning: 'true' + + - name: Build aicr and snapshot agent image + uses: ./.github/actions/aicr-build + with: + build_snapshot_agent: 'true' + validator_phases: 'none' + + # Fast readiness gate after cluster setup. 
Stability windows start after + # runtime install, where component rollouts can stress the control plane. + - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + stability_window: 0s + recover_unhealthy: 'true' + + - name: Install runtime bundle + id: bundle-install + uses: ./.github/actions/gpu-operator-install + with: + method: bundle + accelerator: h100 + intent: ${{ inputs.intent }} + platform: ${{ inputs.platform }} + wait: 'true' + best_effort: 'false' + + - name: Check runtime component health + run: bash .github/scripts/gpu-runtime-component-health.sh "${{ inputs.intent }}" + + # Runtime install creates many CRDs, webhooks, and controllers. Keep a + # stability window here to catch KCM/scheduler restarts before snapshot. + - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' + + - name: Snapshot and validate GPU + uses: ./.github/actions/gpu-snapshot-validate + with: + gpu_model: H100 + min_gpu_count: '2' + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + snapshot_timeout: 10m + + # Snapshot deploys a GPU Job and exercises cluster discovery; verify the + # control plane stayed stable before adding Karpenter/KWOK. 
+ - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' + + - name: Install Karpenter + KWOK + uses: ./.github/actions/install-karpenter-kwok + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + kwok_helm_timeout: 600s + ko_build_timeout: '1200' + karpenter_helm_timeout: 600s + + - name: Install chainsaw + uses: ./.github/actions/setup-build-tools + with: + install_chainsaw: 'true' + chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' + + # This gate is effectively post-Karpenter/KWOK; the chainsaw setup above + # only installs a runner-side binary. + - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' + + - name: Run chainsaw health checks + run: bash .github/scripts/gpu-chainsaw-health.sh "${{ inputs.chainsaw_path }}" + + - name: Build conformance validator image + uses: ./.github/actions/aicr-build + with: + build_cli: 'false' + build_snapshot_agent: 'false' + validator_phases: 'conformance' + + # Validator image build/load can contend with Docker and kind containerd; + # verify the control plane before the final conformance workload. 
+ - name: Check control plane health + uses: ./.github/actions/check-control-plane-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait_timeout: 120s + stability_window: 60s + recover_unhealthy: 'true' + + - name: Validate CNCF AI Conformance + id: validate-conformance + run: bash .github/scripts/gpu-validate-conformance.sh + + - name: Upload validation artifacts + if: always() + timeout-minutes: 5 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: conformance-evidence + path: | + conformance-evidence/ + validation-result.yaml + if-no-files-found: warn + + - name: Debug diagnostics + if: failure() + timeout-minutes: 5 + uses: ./.github/actions/gpu-debug-diagnostics + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + mode: ${{ inputs.intent }} + + - name: Mark debug artifact collection + id: gpu-debug-artifacts + if: failure() || cancelled() + shell: bash + run: echo "collect=true" >> "${GITHUB_OUTPUT}" + + - name: GPU Test Cleanup + if: always() + uses: ./.github/actions/gpu-test-cleanup + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + artifact_name_prefix: ${{ inputs.artifact_name_prefix }} + collect_artifacts: ${{ steps.gpu-debug-artifacts.outputs.collect == 'true' }} diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index d3a04de03..51fbed8ba 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -20,8 +20,6 @@ on: push: branches: - "pull-request/[0-9]+" - pull_request: - types: [labeled] workflow_dispatch: {} # Allow manual runs permissions: @@ -40,6 +38,8 @@ jobs: should-run: ${{ steps.filter.outputs.matched }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 id: filter with: @@ -50,13 +50,21 @@ jobs: - '.settings.yaml' - 
'.github/actions/gpu-cluster-setup/**' - '.github/actions/gpu-operator-install/**' + - '.github/actions/check-control-plane-health/**' - '.github/actions/aicr-build/**' - '.github/actions/setup-build-tools/**' - '.github/actions/install-karpenter-kwok/**' - 'validators/*/Dockerfile' - 'pkg/evidence/**' + - '.github/workflows/gpu-h100-kind-runtime-test.yaml' + - '.github/actions/gpu-debug-diagnostics/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' + - '.github/scripts/gpu-chainsaw-health.sh' + - '.github/scripts/gpu-debug-diagnostics.sh' + - '.github/scripts/gpu-runtime-component-health.sh' + - '.github/scripts/gpu-validate-conformance.sh' + - 'pkg/bundler/deployer/helm/**' - 'tests/chainsaw/chainsaw-config.yaml' - 'tests/chainsaw/ai-conformance/main.go' - 'tests/chainsaw/ai-conformance/common/**' @@ -78,187 +86,23 @@ jobs: - 'pkg/defaults/timeouts.go' - 'validators/conformance/**' + # NVIDIA self-hosted GPU runners reject pull_request event jobs before + # checkout. PR GPU coverage runs through the pull-request/ push + # mirror after ok-to-test approval. 
gpu-training-test: needs: [check-paths] if: > always() && ( github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') || (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) name: GPU Training Test (nvkind + H100 x2) - concurrency: - group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} - cancel-in-progress: true - runs-on: linux-amd64-gpu-h100-latest-2 - timeout-minutes: 120 - - env: - KIND_CLUSTER_NAME: gpu-training-test - - steps: - - - name: Checkout Code - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Set up GPU cluster - uses: ./.github/actions/gpu-cluster-setup - - - name: Build aicr - uses: ./.github/actions/aicr-build - with: - validator_phases: 'conformance' - - - name: Install runtime bundle - id: bundle-install - uses: ./.github/actions/gpu-operator-install - with: - method: bundle - accelerator: h100 - intent: training - platform: kubeflow - - # --- Snapshot and GPU validation --- - - - name: Snapshot and validate GPU - uses: ./.github/actions/gpu-snapshot-validate - with: - gpu_model: H100 - min_gpu_count: '2' - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - - # --- Install Karpenter + KWOK early to give monitoring stack settle time --- - - - name: Install Karpenter + KWOK - uses: ./.github/actions/install-karpenter-kwok - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - - # --- Health checks --- - - - name: Prepare chainsaw - id: versions - uses: ./.github/actions/load-versions - - - name: Install chainsaw - uses: ./.github/actions/setup-build-tools - with: - install_chainsaw: 'true' - chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' - - - name: Run chainsaw health checks - run: | - chainsaw test \ - --test-dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --config tests/chainsaw/chainsaw-config.yaml 
- - # --- CNCF AI Conformance validation --- - # Runs last to ensure the DCGM → Prometheus → adapter pipeline - # has had time to bootstrap (pod-autoscaling check needs live metric data). - - - name: Verify expected resources exist - run: | - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug - - - name: Validate CNCF AI Conformance - id: validate-conformance - run: | - AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ - ./aicr validate \ - --recipe recipe.yaml \ - --phase conformance \ - --namespace gpu-operator \ - --kubeconfig="${HOME}/.kube/config" \ - --require-gpu \ - --image=ko.local:smoke-test \ - --timeout=10m \ - --toleration '*' \ - --output=validation-result.yaml \ - --evidence-dir=conformance-evidence - - # --- Validation artifacts --- - - # Collect a post-run resource snapshot regardless of whether conformance - # validation ran, so triage always has a cluster-state artifact. 
- - name: Collect validation artifacts - if: >- - always() - && !cancelled() - && steps.bundle-install.outcome == 'success' - continue-on-error: true - shell: bash - run: | - set -o pipefail - mkdir -p conformance-evidence - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug | tee conformance-evidence/resource-existence-post.txt - - - name: Upload validation artifacts - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: conformance-evidence - path: | - conformance-evidence/ - validation-result.yaml - if-no-files-found: warn - - # --- Debug diagnostics (before cleanup so resources still exist) --- - - - name: Debug diagnostics - if: failure() - run: | - echo "=== Grafana deployment ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true - echo "=== Grafana pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \ - -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true - echo "=== Grafana deployment describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true - echo "=== Grafana pod describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \ - -l app.kubernetes.io/name=grafana 2>/dev/null || true - echo "=== KAI scheduler pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true - echo "=== KAI scheduler logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \ - logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true - echo "=== KAI scheduler queues ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true - 
echo "=== KAI scheduler podgroups ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true - echo "=== Kubeflow Trainer deployment ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true - echo "=== Kubeflow pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true - echo "=== Kubeflow validating webhooks ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true - echo "=== Kubeflow Trainer CRD ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true - echo "=== Non-running pods (all namespaces) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \ - --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true - echo "=== GPU Operator pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true - echo "=== Node resources ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \ - grep -A 20 "Allocated resources" || true - - - name: GPU Test Cleanup - if: always() - uses: ./.github/actions/gpu-test-cleanup - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - artifact_name_prefix: gpu-training-test-debug + uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml + with: + job_name: GPU Training Test (nvkind + H100 x2) + cluster_name: gpu-training-test + intent: training + platform: kubeflow + chainsaw_path: tests/chainsaw/ai-conformance/kind-training-kubeflow + artifact_name_prefix: gpu-training-test-debug diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml index d5b8c5c74..af8d3860c 100644 --- a/.github/workflows/gpu-smoke-test.yaml +++ b/.github/workflows/gpu-smoke-test.yaml @@ -20,8 +20,6 @@ on: 
push: branches: - "pull-request/[0-9]+" - pull_request: - types: [labeled] workflow_dispatch: {} # Allow manual runs permissions: @@ -40,6 +38,8 @@ jobs: should-run: ${{ steps.filter.outputs.matched }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 id: filter with: @@ -50,8 +50,13 @@ jobs: - '.github/actions/gpu-cluster-setup/**' - '.github/actions/gpu-operator-install/**' - '.github/actions/aicr-build/**' + - '.github/actions/gpu-debug-diagnostics/**' - '.github/actions/gpu-test-cleanup/**' + - '.github/actions/gpu-smoke-nvidia-smi/**' - '.github/actions/load-versions/**' + - '.github/scripts/gpu-debug-diagnostics.sh' + - '.github/scripts/gpu-smoke-run-nvidia-smi.sh' + - '.github/scripts/gpu-smoke-show-nvidia-smi-log.sh' - 'pkg/collector/**' - 'pkg/snapshotter/**' - '.github/actions/gpu-snapshot-validate/**' @@ -62,11 +67,13 @@ jobs: gpu-smoke-test: needs: [check-paths] + # NVIDIA self-hosted GPU runners reject pull_request event jobs before + # checkout. PR GPU coverage runs through the pull-request/ push + # mirror after ok-to-test approval. if: > always() && ( github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') || (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) name: GPU Smoke Test (nvkind + L40G) @@ -88,6 +95,12 @@ jobs: - name: Set up GPU cluster uses: ./.github/actions/gpu-cluster-setup + with: + # Keep smoke runner preflight explicit so action default changes do not + # silently alter L40G coverage. 
+ min_gpu_count: '1' + min_free_disk_gb: '20' + min_available_memory_gb: '8' - name: Build aicr uses: ./.github/actions/aicr-build @@ -100,31 +113,9 @@ jobs: method: helm - name: Run nvidia-smi in a pod - run: | - cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - - apiVersion: v1 - kind: Pod - metadata: - name: gpu-smoke-test - spec: - restartPolicy: Never - containers: - - name: nvidia-smi - image: ubuntu:22.04 - command: ["nvidia-smi"] - resources: - limits: - nvidia.com/gpu: 1 - EOF - - echo "Waiting for gpu-smoke-test pod to complete..." - kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \ - --for=condition=Ready --timeout=120s || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \ - --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s - - - name: Show nvidia-smi output - run: kubectl --context="kind-${KIND_CLUSTER_NAME}" logs gpu-smoke-test + uses: ./.github/actions/gpu-smoke-nvidia-smi + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} # --- Snapshot and validation --- @@ -137,20 +128,20 @@ jobs: - name: Debug diagnostics if: failure() - run: | - echo "=== ClusterPolicy status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true - echo "=== GPU Operator pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true - echo "=== Non-running pods (all namespaces) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true - echo "=== Recent events (gpu-operator) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - echo "=== Node status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true + uses: ./.github/actions/gpu-debug-diagnostics + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} 
+ mode: smoke + + - name: Mark debug artifact collection + id: gpu-debug-artifacts + if: failure() || cancelled() + shell: bash + run: echo "collect=true" >> "${GITHUB_OUTPUT}" - name: GPU Test Cleanup if: always() uses: ./.github/actions/gpu-test-cleanup with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} + collect_artifacts: ${{ steps.gpu-debug-artifacts.outputs.collect == 'true' }} diff --git a/.settings.yaml b/.settings.yaml index 75b4559b1..5ef15198f 100644 --- a/.settings.yaml +++ b/.settings.yaml @@ -40,6 +40,7 @@ security_tools: testing_tools: kubectl: 'v1.35.0' kind: '0.31.0' + nvkind: '78a0a514c41c3e77ac0d935f38d971d3b4455138' ctlptl: '0.9.0' tilt: '0.37.0' helm: 'v4.1.1' @@ -71,6 +72,7 @@ docs_tools: # Testing Configuration testing: kind_node_image: 'kindest/node:v1.32.0' + h100_kind_node_image: 'kindest/node:v1.35.0' # Component test harness configuration # Used by tools/component-test/ scripts to validate individual components diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index 302684d5a..c641eee07 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -1308,6 +1308,8 @@ Unknown flags are rejected with an error to catch typos (e.g., `--best-effort`). The deploy script retries failed `helm upgrade --install` and `kubectl apply` operations with exponential backoff. By default, each operation is retried up to 5 times (6 total attempts). The backoff delay increases quadratically: 5s, 20s, 45s, 80s, 120s (capped) between retries. +On slower H100 CI runners, `kube-prometheus-stack` can hit Grafana's Deployment progress deadline before a longer Helm timeout would help. The deploy script intentionally keeps the default timeout and retry budget for `kube-prometheus-stack` so subsequent upgrade attempts can succeed after image pulls and controllers settle. Kind H100 Chainsaw health checks do not require Grafana because AICR conformance metrics use Prometheus, DCGM exporter, and prometheus-adapter directly. 
+ Use `--retries 0` to disable retries (fail-fast behavior). When `--best-effort` is also set, retries are exhausted first before falling through to best-effort handling. **Pre-install manifests and CRD ordering:** @@ -1318,7 +1320,11 @@ After `helm install`, the same manifests are re-applied as post-install to ensur **Async components:** -Components that use operator patterns with custom resources that reconcile asynchronously (e.g., `kai-scheduler`) are installed without `--wait` to avoid Helm timing out on CR readiness. +Components that use operator patterns with custom resources that reconcile asynchronously use component-specific install behavior: + +- `kai-scheduler` installs without `--wait` to avoid Helm timing out on CR readiness. It uses a 30-minute per-attempt timeout and caps the retry budget at 1 retry so hook diagnostics surface quickly on cold runners. +- `dynamo-platform` has `deploy.sh` attempt `--server-side=false` so retries and upgrades do not conflict with Grove-managed webhook certificate Secret data, while preserving Helm `--wait` behavior. The script only adds `--server-side=false` when Helm v4.0.5 or later is detected; with older Helm clients it logs a warning and proceeds without that mitigation. Dynamo installs use a 20-minute per-attempt timeout and cap the retry budget at 3 retries because cold-start failures often involve operator webhook and certificate resources settling across attempts. +- `kube-prometheus-stack` keeps the default timeout and retry budget because Grafana can hit its Deployment progress deadline before a longer Helm timeout would help, especially on slower H100 CI runners under image-pull and control-plane load. 
##### DRA kubelet plugin registration diff --git a/kwok/scripts/install-karpenter-kwok.sh b/kwok/scripts/install-karpenter-kwok.sh index 72b64dae1..d6a17481f 100755 --- a/kwok/scripts/install-karpenter-kwok.sh +++ b/kwok/scripts/install-karpenter-kwok.sh @@ -41,7 +41,9 @@ KARPENTER_VERSION="${KARPENTER_VERSION:-v1.8.0}" KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}" KARPENTER_NAMESPACE="${KARPENTER_NAMESPACE:-karpenter}" KARPENTER_CLONE_DIR="${KARPENTER_CLONE_DIR:-/tmp/karpenter}" +KWOK_HELM_TIMEOUT="${KWOK_HELM_TIMEOUT:-300s}" KO_BUILD_TIMEOUT="${KO_BUILD_TIMEOUT:-900}" # 15 minutes +KARPENTER_HELM_TIMEOUT="${KARPENTER_HELM_TIMEOUT:-300s}" RED='\033[0;31m' GREEN='\033[0;32m' @@ -68,7 +70,7 @@ install_kwok() { helm upgrade --install kwok-controller kwok/kwok \ --namespace kube-system \ --set hostNetwork=true \ - --wait --timeout 300s + --wait --timeout "${KWOK_HELM_TIMEOUT}" helm upgrade --install kwok-stage-fast kwok/stage-fast \ --namespace kube-system @@ -98,11 +100,16 @@ build_karpenter() { # Redirect stderr to avoid Go compilation warnings corrupting the image reference. # Output format: kind.local/: # Hard timeout prevents a slow/stuck compilation from consuming the entire job. 
+ local ko_stderr="${KARPENTER_CLONE_DIR}/ko-build.stderr" CONTROLLER_IMG=$(timeout "${KO_BUILD_TIMEOUT}" \ env KO_DOCKER_REPO=kind.local \ KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}" \ - ko build sigs.k8s.io/karpenter/kwok 2>/dev/null) || { + ko build sigs.k8s.io/karpenter/kwok 2>"${ko_stderr}") || { log_error "ko build failed or timed out after ${KO_BUILD_TIMEOUT}s" + if [[ -s "${ko_stderr}" ]]; then + log_error "ko build stderr:" + sed 's/^/ /' "${ko_stderr}" || true + fi exit 1 } @@ -187,7 +194,7 @@ deploy_karpenter() { --set 'controller.extraVolumeMounts[0].readOnly=true' \ --set 'controller.env[0].name=INSTANCE_TYPES_FILE_PATH' \ --set 'controller.env[0].value=/etc/karpenter/instance-types/instance-types.json' \ - --wait --timeout 300s \ + --wait --timeout "${KARPENTER_HELM_TIMEOUT}" \ || { log_error "Helm install failed. Diagnostics:" kubectl -n "${KARPENTER_NAMESPACE}" get pods -o wide 2>/dev/null || true @@ -213,6 +220,7 @@ main() { log_info "Karpenter version: ${KARPENTER_VERSION}" log_info "Kind cluster: ${KIND_CLUSTER_NAME}" log_info "Namespace: ${KARPENTER_NAMESPACE}" + log_info "Timeouts: kwok=${KWOK_HELM_TIMEOUT} ko-build=${KO_BUILD_TIMEOUT}s karpenter=${KARPENTER_HELM_TIMEOUT}" install_kwok build_karpenter diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh index 459b054b5..6b4af1549 100755 --- a/kwok/scripts/run-all-recipes.sh +++ b/kwok/scripts/run-all-recipes.sh @@ -37,6 +37,31 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } log_error() { echo -e "${RED}[ERROR]${NC} $*"; } +retry_command() { + local description="$1" + shift + + local max_attempts="${KWOK_COMMAND_RETRIES:-3}" + local delay="${KWOK_COMMAND_RETRY_DELAY:-5}" + local attempt=1 + + while true; do + if "$@"; then + return 0 + fi + + if ((attempt >= max_attempts)); then + log_error "${description} failed after ${attempt} attempt(s)" + return 1 + fi + + log_warn "${description} failed (attempt 
${attempt}/${max_attempts}); retrying in ${delay}s..." + sleep "${delay}" + attempt=$((attempt + 1)) + delay=$((delay * 2)) + done +} + # Find recipes with service criteria (testable cloud configurations) get_recipes() { for overlay in "${OVERLAYS_DIR}"/*.yaml; do @@ -68,10 +93,13 @@ ensure_cluster() { if ! kubectl get deployment -n kube-system kwok-controller &>/dev/null; then log_info "Installing KWOK controller..." - helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update - helm upgrade --install kwok-controller kwok/kwok \ + retry_command "Adding KWOK Helm repository" \ + helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update + retry_command "Installing KWOK controller" \ + helm upgrade --install kwok-controller kwok/kwok \ --namespace kube-system --set hostNetwork=true --wait - helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system + retry_command "Installing KWOK stage-fast" \ + helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system fi # Patch kindnet to exclude KWOK nodes diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 435948e8f..33dc2c6e9 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -20,6 +20,7 @@ import ( "os" "os/exec" "path/filepath" + "regexp" "strings" "testing" "time" @@ -493,9 +494,9 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) { } script := string(content) - // kai-scheduler should get a custom 20m timeout override - if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="20m"`) { - t.Error("deploy.sh missing kai-scheduler 20m timeout override") + // kai-scheduler should get a custom 30m timeout override + if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="30m"`) { + t.Error("deploy.sh missing kai-scheduler 30m timeout override") } // Other components should use the default HELM_TIMEOUT if !strings.Contains(script, 
`COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`) { @@ -505,15 +506,241 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) { if !strings.Contains(script, `COMPONENT_MAX_RETRIES="1"`) { t.Error("deploy.sh missing kai-scheduler retry override") } + if !strings.Contains(script, `if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]`) { + t.Error("deploy.sh missing kai-scheduler retry cap") + } if !strings.Contains(script, `dump_kai_scheduler_helm_diagnostics "${namespace}"`) { t.Error("deploy.sh missing kai-scheduler diagnostics hook") } - if !strings.Contains(script, `kubectl get jobs -n "${namespace}"`) { + if !strings.Contains(script, `kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}"`) { t.Error("deploy.sh missing job diagnostics") } - if !strings.Contains(script, `kubectl describe pods -n "${namespace}"`) { + if !strings.Contains(script, `kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}"`) { t.Error("deploy.sh missing pod diagnostics") } + + rootReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "README.md")) + if err != nil { + t.Fatalf("failed to read root README: %v", err) + } + componentReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "kai-scheduler", "README.md")) + if err != nil { + t.Fatalf("failed to read component README: %v", err) + } + rootReadme := string(rootReadmeContent) + componentReadme := string(componentReadmeContent) + if !strings.Contains(rootReadme, `--timeout 30m`) { + t.Error("root README missing kai-scheduler 30m timeout") + } + if !strings.Contains(componentReadme, `--timeout 30m`) { + t.Error("component README missing kai-scheduler 30m timeout") + } + if strings.Contains(componentReadme, `--wait --timeout 30m`) { + t.Error("component README should document kai-scheduler without --wait") + } + if strings.Contains(componentReadme, `--wait --timeout 10m`) { + t.Error("component README should not use default timeout for kai-scheduler") + } +} + +func 
TestGenerate_DeployScriptComponentTimeouts(t *testing.T) { + retryCapPattern := regexp.MustCompile(`(?m)(if \[\[ "\$\{COMPONENT_MAX_RETRIES\}" -gt \d+ \]\]|COMPONENT_MAX_RETRIES="\d+")`) + applyArgsExpansion := `${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"}` + tests := []struct { + name string + component recipe.ComponentRef + wantTimeout string + wantRetryAssignment string + wantRetryCap string + wantApplyArgs string + wantComment string + wantSnippets []string + wantReadmeSnippets []string + rejectSnippets []string + rejectScriptSnippets []string + rejectReadmeSnippets []string + rejectRetryCap bool + }{ + { + name: "dynamo-platform", + component: recipe.ComponentRef{ + Name: "dynamo-platform", + Namespace: "dynamo-system", + Chart: "dynamo-platform", + Version: "0.9.0", + Type: recipe.ComponentTypeHelm, + Source: "oci://nvcr.io/nvidia/ai-dynamo", + }, + wantTimeout: `COMPONENT_HELM_TIMEOUT="20m"`, + wantRetryAssignment: `COMPONENT_MAX_RETRIES="3"`, + wantRetryCap: `if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]`, + wantApplyArgs: `COMPONENT_HELM_APPLY_ARGS=(--server-side=false)`, + wantSnippets: []string{ + `helm_supports_server_side_false_install`, + `Require v4.0.5+ before relying on`, + `--request-timeout="${KUBECTL_REQUEST_TIMEOUT}"`, + `dynamo-platform conflict mitigation requires Helm v4.0.5+`, + `dump_dynamo_platform_helm_diagnostics "${component}" "${namespace}"`, + `deployment/dynamo-platform-dynamo-operator-controller-manager`, + `--previous --tail=200`, + }, + wantReadmeSnippets: []string{ + `--server-side=false`, + `requires Helm v4.0.5 or later`, + `--wait --timeout 20m`, + }, + rejectScriptSnippets: []string{ + `local prerelease`, + `if [[ -n "${prerelease}" ]]`, + }, + }, + { + name: "kube-prometheus-stack", + component: recipe.ComponentRef{ + Name: "kube-prometheus-stack", + Namespace: "monitoring", + Chart: "kube-prometheus-stack", + Version: "82.8.0", + Type: recipe.ComponentTypeHelm, + Source: 
"https://prometheus-community.github.io/helm-charts", + }, + wantTimeout: `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`, + wantComment: `preserve the default retry`, + rejectRetryCap: true, + }, + { + name: "ordinary component defaults", + component: recipe.ComponentRef{ + Name: "gpu-operator", + Namespace: "gpu-operator", + Chart: "gpu-operator", + Version: "v25.10.1", + Type: recipe.ComponentTypeHelm, + Source: "https://helm.ngc.nvidia.com/nvidia", + }, + wantTimeout: `COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"`, + wantApplyArgs: `COMPONENT_HELM_APPLY_ARGS=()`, + wantReadmeSnippets: []string{ + `--wait --timeout 10m`, + }, + rejectSnippets: []string{ + `--server-side=false`, + `COMPONENT_MAX_RETRIES="1"`, + `COMPONENT_MAX_RETRIES="3"`, + }, + rejectReadmeSnippets: []string{ + `--server-side=false`, + `--wait --timeout 20m`, + `--timeout 30m`, + }, + rejectRetryCap: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + outputDir := t.TempDir() + + g := &Generator{ + RecipeResult: &recipe.RecipeResult{ + Kind: "RecipeResult", + APIVersion: "aicr.nvidia.com/v1alpha1", + ComponentRefs: []recipe.ComponentRef{tt.component}, + DeploymentOrder: []string{tt.component.Name}, + }, + ComponentValues: map[string]map[string]any{ + tt.component.Name: {}, + }, + Version: "v1.0.0", + } + + _, err := g.Generate(ctx, outputDir) + if err != nil { + t.Fatalf("Generate failed: %v", err) + } + + content, err := os.ReadFile(filepath.Join(outputDir, "deploy.sh")) + if err != nil { + t.Fatalf("failed to read deploy.sh: %v", err) + } + script := string(content) + + blockStart := strings.Index(script, `Installing `+tt.component.Name) + if blockStart == -1 { + t.Fatalf("deploy.sh missing %s install block", tt.component.Name) + } + blockEnd := strings.Index(script[blockStart:], `helm upgrade --install `+tt.component.Name) + if blockEnd == -1 { + t.Fatalf("deploy.sh missing %s helm install command", tt.component.Name) + } + 
componentBlock := script[blockStart : blockStart+blockEnd] + + if !strings.Contains(componentBlock, tt.wantTimeout) { + t.Errorf("deploy.sh missing %s timeout override %q", tt.component.Name, tt.wantTimeout) + } + if tt.wantRetryAssignment != "" && !strings.Contains(componentBlock, tt.wantRetryAssignment) { + t.Errorf("deploy.sh missing %s retry override %q", tt.component.Name, tt.wantRetryAssignment) + } + if tt.wantRetryCap != "" && !strings.Contains(componentBlock, tt.wantRetryCap) { + t.Errorf("deploy.sh missing %s retry cap %q", tt.component.Name, tt.wantRetryCap) + } + if tt.wantApplyArgs != "" && !strings.Contains(componentBlock, tt.wantApplyArgs) { + t.Errorf("deploy.sh missing %s apply args %q", tt.component.Name, tt.wantApplyArgs) + } + if tt.wantApplyArgs != "" && !strings.Contains(script[blockStart:], applyArgsExpansion) { + t.Errorf("deploy.sh missing %s apply args in helm command", tt.component.Name) + } + if tt.wantComment != "" && !strings.Contains(componentBlock, tt.wantComment) { + t.Errorf("deploy.sh missing %s retry rationale", tt.component.Name) + } + for _, snippet := range tt.wantSnippets { + if !strings.Contains(script, snippet) { + t.Errorf("deploy.sh missing %s snippet %q", tt.component.Name, snippet) + } + } + for _, snippet := range tt.rejectSnippets { + if strings.Contains(componentBlock, snippet) { + t.Errorf("deploy.sh should not include %s snippet %q", tt.component.Name, snippet) + } + } + for _, snippet := range tt.rejectScriptSnippets { + if strings.Contains(script, snippet) { + t.Errorf("deploy.sh should not include %s script snippet %q", tt.component.Name, snippet) + } + } + if tt.rejectRetryCap && retryCapPattern.MatchString(componentBlock) { + t.Errorf("deploy.sh should not cap %s retries in its component block", tt.component.Name) + } + + rootReadmeContent, err := os.ReadFile(filepath.Join(outputDir, "README.md")) + if err != nil { + t.Fatalf("failed to read root README: %v", err) + } + componentReadmeContent, err := 
os.ReadFile(filepath.Join(outputDir, tt.component.Name, "README.md")) + if err != nil { + t.Fatalf("failed to read component README: %v", err) + } + rootReadme := string(rootReadmeContent) + componentReadme := string(componentReadmeContent) + for _, snippet := range tt.wantReadmeSnippets { + if !strings.Contains(rootReadme, snippet) { + t.Errorf("root README missing %s snippet %q", tt.component.Name, snippet) + } + if !strings.Contains(componentReadme, snippet) { + t.Errorf("component README missing %s snippet %q", tt.component.Name, snippet) + } + } + for _, snippet := range tt.rejectReadmeSnippets { + if strings.Contains(rootReadme, snippet) { + t.Errorf("root README should not include %s snippet %q", tt.component.Name, snippet) + } + if strings.Contains(componentReadme, snippet) { + t.Errorf("component README should not include %s snippet %q", tt.component.Name, snippet) + } + } + }) + } } func TestGenerate_UndeployScriptExecutable(t *testing.T) { diff --git a/pkg/bundler/deployer/helm/templates/README.md.tmpl b/pkg/bundler/deployer/helm/templates/README.md.tmpl index 3c3e874f4..1ef2f252b 100644 --- a/pkg/bundler/deployer/helm/templates/README.md.tmpl +++ b/pkg/bundler/deployer/helm/templates/README.md.tmpl @@ -77,21 +77,36 @@ kustomize build '{{ .Repository }}//{{ .Path }}{{ if .Tag }}?ref={{ .Tag }}{{ en ```bash {{ if .IsOCI -}} helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --version {{ .Version }} \ -n {{ .Namespace }} --create-namespace \ -f {{ .Name }}/values.yaml \ -f {{ .Name }}/cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ else -}} helm upgrade --install {{ .Name }} {{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --repo {{ .Repository }} \ --version {{ 
.Version }} \ -n {{ .Namespace }} --create-namespace \ -f {{ .Name }}/values.yaml \ -f {{ .Name }}/cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ end -}} ``` +{{ if eq .Name "dynamo-platform" }} +`--server-side=false` requires Helm v4.0.5 or later. `deploy.sh` checks the Helm version before adding the flag and warns before falling back when unavailable. +{{ end -}} {{ end -}} {{ if .HasManifests }} ```bash @@ -119,7 +134,9 @@ Each Helm component has two values files in its directory: ## Upgrade -To upgrade a specific Helm component: +To upgrade a specific Helm component, use the generic form below. Some +components require component-specific flags; use the component subdirectory +`README.md` for the exact command. ```bash helm upgrade --version -n -f /values.yaml -f /cluster-values.yaml --wait --timeout 10m diff --git a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl index 068bfcd28..66762ac7f 100644 --- a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl +++ b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl @@ -43,21 +43,36 @@ Namespace: {{ .Namespace }} ```bash {{ if .IsOCI -}} helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --version {{ .Version }} \ -n {{ .Namespace }} --create-namespace \ -f values.yaml \ -f cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ else -}} helm upgrade --install {{ .Name }} {{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --repo {{ .Repository }} \ --version {{ .Version }} \ -n {{ 
.Namespace }} --create-namespace \ -f values.yaml \ -f cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ end -}} ``` +{{ if eq .Name "dynamo-platform" }} +`--server-side=false` requires Helm v4.0.5 or later. `deploy.sh` checks the Helm version before adding the flag and warns before falling back when unavailable. +{{ end -}} {{ if .HasManifests }} After the chart is installed, apply additional manifests: @@ -70,19 +85,31 @@ kubectl apply -f manifests/ ```bash {{ if .IsOCI -}} helm upgrade {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --version {{ .Version }} \ -n {{ .Namespace }} \ -f values.yaml \ -f cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ else -}} helm upgrade {{ .Name }} {{ .ChartName }} \ + {{ if eq .Name "dynamo-platform" }}--server-side=false \ + {{ end -}} --repo {{ .Repository }} \ --version {{ .Version }} \ -n {{ .Namespace }} \ -f values.yaml \ -f cluster-values.yaml \ - --wait --timeout 10m + {{ if eq .Name "kai-scheduler" -}} + --timeout 30m + {{ else -}} + --wait --timeout {{ if eq .Name "dynamo-platform" }}20m{{ else }}10m{{ end }} + {{ end -}} {{ end -}} ``` diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 0f83eb71c..6b946a969 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -24,6 +24,7 @@ trap 'rm -rf "${HELM_WORKDIR}"' EXIT cd "${HELM_WORKDIR}" HELM_TIMEOUT="10m" +KUBECTL_REQUEST_TIMEOUT="10s" NO_WAIT=false BEST_EFFORT=false FAILED_COMPONENTS="" @@ -59,6 +60,34 @@ function backoff_seconds() { echo 
"${seconds}" } +function helm_supports_server_side_false_install() { + local version + local major + local minor + local patch + + # Helm v4.0.0-v4.0.4 advertise --server-side=false but ignore it for the + # upgrade --install install-fallback path. Require v4.0.5+ before relying on + # the flag for Dynamo's webhook Secret conflict mitigation. + version="$(helm version --short 2>/dev/null | head -n 1 || true)" + version="${version#v}" + version="${version%%+*}" + if ! [[ "${version}" =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)(-[0-9A-Za-z.-]+)?$ ]]; then + return 1 + fi + major="${BASH_REMATCH[1]}" + minor="${BASH_REMATCH[2]}" + patch="${BASH_REMATCH[3]}" + if (( major < 4 )); then + return 1 + fi + if (( major == 4 )) && (( minor == 0 )) && (( patch < 5 )); then + return 1 + fi + + helm help upgrade 2>/dev/null | grep -q -- '--server-side' +} + function retry() { local desc="$1"; shift local attempt=0 @@ -86,7 +115,7 @@ function retry() { function cleanup_helm_hooks() { local namespace="$1" local job_names - job_names=$(kubectl get jobs -n "${namespace}" \ + job_names=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" \ --field-selector=status.successful=0 \ -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \ 2>/dev/null || true) @@ -97,7 +126,7 @@ function cleanup_helm_hooks() { [[ -z "${name}" ]] && continue # Get the full Job JSON to reliably check annotations and status local job_json - job_json=$(kubectl get job "${name}" -n "${namespace}" -o json 2>/dev/null || true) + job_json=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get job "${name}" -n "${namespace}" -o json 2>/dev/null || true) [[ -z "${job_json}" ]] && continue # Skip non-hook Jobs (no helm.sh/hook annotation) local hook_val @@ -106,13 +135,13 @@ function cleanup_helm_hooks() { # Capture diagnostics before deleting. This helps diagnose transient hook # failures (e.g., dynamo ssh-keygen) that are otherwise lost after cleanup. 
echo " --- Failed hook Job ${name} diagnostics ---" - kubectl describe job "${name}" -n "${namespace}" 2>/dev/null | tail -50 || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe job "${name}" -n "${namespace}" 2>/dev/null | tail -50 || true local pod_names - pod_names=$(kubectl get pods -n "${namespace}" -l "job-name=${name}" \ + pod_names=$(kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -l "job-name=${name}" \ -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true) for pod_name in ${pod_names}; do echo " --- Hook pod ${pod_name} describe ---" - kubectl describe pod "${pod_name}" -n "${namespace}" 2>/dev/null | tail -50 || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pod "${pod_name}" -n "${namespace}" 2>/dev/null | tail -50 || true done echo " --- End diagnostics for ${name} ---" # Delete any non-succeeded hook Job. This function only runs after a Helm @@ -120,7 +149,7 @@ function cleanup_helm_hooks() { # retry — whether it failed, is stuck Pending (timed out before the pod # started), or is still active with a stuck container. echo " Cleaning up stale Helm hook Job ${name} in ${namespace}..." 
- kubectl delete job "${name}" -n "${namespace}" --ignore-not-found 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" delete job "${name}" -n "${namespace}" --ignore-not-found 2>/dev/null || true done <<< "${job_names}" } @@ -132,28 +161,58 @@ function dump_kai_scheduler_helm_diagnostics() { echo " --- ${namespace} diagnostics ---" echo " Jobs:" - kubectl get jobs -n "${namespace}" 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" 2>/dev/null || true echo " Job descriptions:" - kubectl describe jobs -n "${namespace}" 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe jobs -n "${namespace}" 2>/dev/null || true echo " Pods:" - kubectl get pods -n "${namespace}" -o wide 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n "${namespace}" -o wide 2>/dev/null || true echo " Pod descriptions:" - kubectl describe pods -n "${namespace}" 2>/dev/null || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}" 2>/dev/null || true echo " Recent events:" - kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + echo " --- End ${namespace} diagnostics ---" +} + +function dump_dynamo_platform_helm_diagnostics() { + local component="$1" + local namespace="$2" + if [[ "${component}" != "dynamo-platform" ]]; then + return + fi + + echo " --- ${namespace} diagnostics ---" + echo " Deployments:" + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get deployments -n "${namespace}" -o wide 2>/dev/null || true + echo " Jobs:" + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get jobs -n "${namespace}" 2>/dev/null || true + echo " Pods:" + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get pods -n 
"${namespace}" -o wide 2>/dev/null || true + echo " Pod descriptions:" + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" describe pods -n "${namespace}" 2>/dev/null || true + echo " Dynamo operator manager logs:" + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --tail=200 2>/dev/null || true + echo " Dynamo operator manager previous logs:" + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/dynamo-platform-dynamo-operator-controller-manager -n "${namespace}" -c manager --previous --tail=200 2>/dev/null || true + echo " Grove operator logs:" + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/grove-operator -n "${namespace}" --all-containers --tail=100 2>/dev/null || true + echo " Grove operator previous logs:" + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" logs deployment/grove-operator -n "${namespace}" --all-containers --previous --tail=100 2>/dev/null || true + echo " Recent events:" + kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true echo " --- End ${namespace} diagnostics ---" } # helm_retry contract: -# helm_retry "" "" "" [args...] -# Callers must pass the retry budget as the third positional argument before the -# command to execute. This keeps per-component retry tuning explicit at the -# callsite instead of relying on the global MAX_RETRIES fallback. +# helm_retry "" "" "" "" [args...] +# Callers must pass the component name and retry budget before the command to +# execute. This keeps per-component retry tuning and diagnostics explicit at the +# callsite instead of relying on global fallbacks. 
function helm_retry() { local desc="$1" - local namespace="$2" - local max_retries="$3" - shift 3 + local component="$2" + local namespace="$3" + local max_retries="$4" + shift 4 local attempt=0 while true; do if "$@"; then @@ -161,6 +220,7 @@ function helm_retry() { fi attempt=$((attempt + 1)) dump_kai_scheduler_helm_diagnostics "${namespace}" + dump_dynamo_platform_helm_diagnostics "${component}" "${namespace}" if [[ ${attempt} -gt ${max_retries} ]]; then echo "ERROR: ${desc} failed after ${attempt} attempts" return 1 @@ -371,13 +431,35 @@ retry "{{ .Name }} pre-install manifests" apply_ignoring_crd_race "${SCRIPT_DIR} || helm_failed "{{ .Name }}" {{ end -}} # Per-component timeout override. Most components use HELM_TIMEOUT (10m). -# Components with slow hooks (e.g., kai-scheduler crd-upgrader image pull -# on cold runners) get a longer timeout to avoid unnecessary retry cycles. +# Components with slow hooks on cold runners get a longer timeout to avoid +# unnecessary retry cycles. COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}" COMPONENT_MAX_RETRIES="${MAX_RETRIES}" +COMPONENT_HELM_APPLY_ARGS=() {{ if eq .Name "kai-scheduler" -}} +COMPONENT_HELM_TIMEOUT="30m" +if [[ "${COMPONENT_MAX_RETRIES}" -gt 1 ]]; then + COMPONENT_MAX_RETRIES="1" +fi +{{ else if eq .Name "dynamo-platform" -}} COMPONENT_HELM_TIMEOUT="20m" -COMPONENT_MAX_RETRIES="1" +# Grove owns the generated webhook certificate Secret data after install. +# Client-side apply avoids server-side field ownership conflicts during retries. +# This flag requires Helm v4.0.5+; earlier Helm v4 releases advertise the flag +# but ignore --server-side=false on a fresh upgrade --install fallback. 
+if helm_supports_server_side_false_install; then + COMPONENT_HELM_APPLY_ARGS=(--server-side=false) +else + echo "::warning::dynamo-platform conflict mitigation requires Helm v4.0.5+ with working --server-side=false install fallback; proceeding without this flag" +fi +if [[ "${COMPONENT_MAX_RETRIES}" -gt 3 ]]; then + COMPONENT_MAX_RETRIES="3" +fi +{{ else if eq .Name "kube-prometheus-stack" -}} +# Grafana can trip its Deployment progress deadline before a longer Helm +# timeout helps, especially on slower H100 CI runners under image-pull and +# control-plane load. Keep the default 10m timeout and preserve the default retry +# budget so later upgrades can succeed after images and controllers settle. {{ end -}} # Derive wait args: global --wait/--no-wait behavior + component timeout. if [[ "${NO_WAIT}" == "true" ]]; then @@ -391,9 +473,11 @@ if echo "${ASYNC_COMPONENTS}" | grep -qw "{{ .Name }}"; then echo " (async component — skipping --wait, keeping --timeout for hooks)" fi {{ if .IsOCI -}} -helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ +helm_retry "{{ .Name }} helm install" "{{ .Name }}" \ + "{{ .Namespace }}" \ "${COMPONENT_MAX_RETRIES}" \ helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ + ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \ {{ if .Version }}--version {{ .Version }} \ {{ end -}} -n {{ .Namespace }} --create-namespace \ @@ -402,9 +486,11 @@ helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ ${COMPONENT_WAIT_ARGS} \ || helm_failed "{{ .Name }}" {{ else -}} -helm_retry "{{ .Name }} helm install" "{{ .Namespace }}" \ +helm_retry "{{ .Name }} helm install" "{{ .Name }}" \ + "{{ .Namespace }}" \ "${COMPONENT_MAX_RETRIES}" \ helm upgrade --install {{ .Name }} {{ .ChartName }} \ + ${COMPONENT_HELM_APPLY_ARGS[@]+"${COMPONENT_HELM_APPLY_ARGS[@]}"} \ --repo {{ .Repository }} \ {{ if .Version }}--version {{ .Version }} \ {{ end -}} diff --git a/recipes/overlays/kind.yaml b/recipes/overlays/kind.yaml 
index b0d8dbd76..791016135 100644 --- a/recipes/overlays/kind.yaml +++ b/recipes/overlays/kind.yaml @@ -115,6 +115,11 @@ spec: - name: kube-prometheus-stack type: Helm overrides: + # CI only needs component health, not the full upstream alerting rule + # set. Skipping default rules reduces PrometheusRule churn during + # install on small kind control planes. + defaultRules: + create: false prometheus: prometheusSpec: # Smaller storage for local testing @@ -132,7 +137,35 @@ spec: memory: 1Gi # Shorter retention for local testing retention: 7d + prometheusOperator: + # Keep operator-owned monitoring custom resources in the monitoring + # namespace for kind. Do not scope ServiceMonitor discovery here; + # GPU, Kubeflow, and Dynamo monitors may live in their own namespaces. + alertmanagerInstanceNamespaces: + - monitoring + alertmanagerConfigNamespaces: + - monitoring + prometheusInstanceNamespaces: + - monitoring + thanosRulerInstanceNamespaces: + - monitoring + # CI kind control planes can be slow under image pulls and controller + # startup. Avoid restarting the operator on short health probe stalls. 
+ livenessProbe: + timeoutSeconds: 10 + failureThreshold: 10 + readinessProbe: + timeoutSeconds: 10 + failureThreshold: 6 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi grafana: + enabled: false resources: requests: cpu: 100m diff --git a/tests/chainsaw/ai-conformance/README.md b/tests/chainsaw/ai-conformance/README.md index b1a88e9d4..a69b88f13 100644 --- a/tests/chainsaw/ai-conformance/README.md +++ b/tests/chainsaw/ai-conformance/README.md @@ -73,10 +73,11 @@ tests/chainsaw/ai-conformance/ │ ├── assert-cert-manager.yaml # cert-manager healthy │ ├── assert-dra-driver.yaml # DRA driver healthy │ ├── assert-kai-scheduler.yaml # KAI scheduler healthy -│ ├── assert-monitoring.yaml # Prometheus stack healthy +│ ├── assert-monitoring.yaml # Prometheus stack healthy with Grafana │ └── assert-skyhook.yaml # Skyhook operator healthy ├── kind-common/ # Shared Kind-only assertions │ ├── assert-gpu-operator.yaml # GPU operator healthy on kind +│ ├── assert-monitoring.yaml # Prometheus stack healthy without Grafana │ ├── assert-network-operator.yaml # Network operator healthy on kind │ └── assert-nvsentinel.yaml # NVSentinel healthy on kind ├── kind-inference-dynamo/ # Kind + H100 + inference + dynamo leaf suite diff --git a/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml new file mode 100644 index 000000000..868be3fea --- /dev/null +++ b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml @@ -0,0 +1,85 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assert kind monitoring stack components required by H100 CI are healthy. +# Grafana is intentionally not asserted here because conformance metrics use +# Prometheus, DCGM exporter, and prometheus-adapter directly. + +# Prometheus Operator - manages Prometheus, Alertmanager, and ServiceMonitor CRs +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-prometheus-operator + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" +--- +# kube-state-metrics - Kubernetes object state metrics +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" +--- +# Prometheus StatefulSet - time series database +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus-kube-prometheus-prometheus + namespace: monitoring +status: + (readyReplicas > `0`): true +--- +# Alertmanager StatefulSet - alert routing and silencing +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: alertmanager-kube-prometheus-alertmanager + namespace: monitoring +status: + (readyReplicas > `0`): true +--- +# Prometheus Node Exporter DaemonSet - node-level hardware/OS metrics +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: prometheus-node-exporter + namespace: monitoring +status: + (numberReady > `0`): true + (desiredNumberScheduled > `0`): true +--- +# k8s-ephemeral-storage-metrics - ephemeral storage usage metrics +apiVersion: apps/v1 +kind: Deployment +metadata: + name: k8s-ephemeral-storage-metrics + namespace: 
monitoring +status: + (conditions[?type == 'Available']): + - status: "True" +--- +# Prometheus Adapter - custom metrics API for HPA +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-adapter + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml index 1b1f701ad..cac236b32 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml @@ -65,10 +65,10 @@ spec: # ── Monitoring ───────────────────────────────────────────────────── - name: assert-monitoring - description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. + description: Verify kind monitoring stack without Grafana. try: - assert: - file: ../common/assert-monitoring.yaml + file: ../kind-common/assert-monitoring.yaml # ── kgateway ─────────────────────────────────────────────────────── - name: assert-kgateway @@ -110,6 +110,8 @@ spec: # ── KAI Scheduler ────────────────────────────────────────────────── - name: assert-kai-scheduler description: Verify KAI scheduler is available. + timeouts: + assert: 600s try: - assert: file: ../common/assert-kai-scheduler.yaml diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml index 382d99104..20332ad64 100644 --- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml @@ -60,10 +60,10 @@ spec: file: ../kind-common/assert-gpu-operator.yaml - name: assert-monitoring - description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. + description: Verify kind monitoring stack without Grafana. 
try: - assert: - file: ../common/assert-monitoring.yaml + file: ../kind-common/assert-monitoring.yaml - name: assert-skyhook description: Verify Skyhook operator controller-manager is available. @@ -73,6 +73,8 @@ spec: - name: assert-kubeflow-trainer description: Verify Kubeflow Trainer controller, validating webhook, and TrainJob CRD are available. + timeouts: + assert: 600s try: - assert: file: assert-kubeflow-trainer.yaml @@ -99,6 +101,8 @@ spec: - name: assert-kai-scheduler description: Verify KAI scheduler is available. + timeouts: + assert: 600s try: - assert: file: ../common/assert-kai-scheduler.yaml