diff --git a/demos/slinky-slurm-demo.html b/demos/slinky-slurm-demo.html new file mode 100644 index 000000000..4085f77f7 --- /dev/null +++ b/demos/slinky-slurm-demo.html @@ -0,0 +1,693 @@ + + + + + + + +AICR · Slinky Slurm · Demo — slides + + + + +
+ NVIDIA AICR · AICR · Slinky Slurm · Demo + +
+ + +
+
+
NVIDIA AI Cluster Runtime
+

AICR · Slinky Slurm · Demo

+

A compact walkthrough of the AICR flow: generate a recipe, render a bundle, deploy to a cluster, validate the Slurm stack.

+
+ Recipe criteria to overlay chain + Bundle deployable artifacts + Deploy ordered cluster install + Validate conformance proof +
+

← → or Space to navigate · F for fullscreen

+
+
+ + +
+
+
01 · Normal workflow
+

AICR's usual shape starts with a snapshot

+
+ + + + + + + 0. Snapshot + capture cluster + snapshot.yaml + + + + 1. Recipe + choose plan + recipe.yaml + + + + 2. Bundle + render artifacts + bundle/ + + + + 3. Deploy + install stack + deploy.sh + + + + 4. Validate + prove result + report.json + + + + + + +
+
+

Snapshot

+

A point-in-time capture of the cluster. It records facts like Kubernetes version, OS, GPUs, labels, taints, topology, and runtime state.

+
+
+ Slurm nuance + Today's Slinky Slurm leaves are generated from criteria flags, not from snapshot intake. Validation still captures or loads snapshot data for pre-flight checks before running Slurm health checks. +
+
+
+ + +
+
+
02 · End-to-end
+

Slinky Slurm e2e flow

+
+ + + + + + + 1. Recipe + Query Mode + recipe.yaml + + 2. Bundle + scheduling values + bundle/ + + 3. Deploy + install stack + ./deploy.sh + + 4. Validate + deployment + conformance + report.json + + + + + + E2E expectation + EKS + H100 + Ubuntu + training + Slinky Slurm + + base → eks → eks-training → h100-eks-training → h100-eks-ubuntu-training → h100-eks-ubuntu-training-slurm + Output: Slinky operator CRDs, Slinky operator, Slinky Slurm cluster chart, constraints, and deployment order + Result: Slurm controller, login pod, REST API, and GPU-backed NodeSet ready + + +
+
+
+ + +
+
+
03 · Generate
+

Recipe: criteria become a resolved plan

+
+
recipe
+
$ aicr recipe --service eks --accelerator h100 \
+    --intent training --os ubuntu --platform slurm \
+    --output recipe.yaml
+
+
+ + + + + + + base + common stack + + + eks + AWS add-ons + + + eks-training + training defaults + + + h100-eks + GPU tuning + + + ubuntu + slurm + OS + platform leaf + + + recipe.yaml + component refs + constraints + deployment order + validation checks + +
+
+ Presenter line + Slurm leaves are generated from criteria flags today. The recipe resolves the Slinky CRDs, operator, cluster chart, GPU GRES settings, constraints, and install order. +
+
+
+ + +
+
+
04 · Choose
+

Slurm leaves are selected with Query Mode

+
+
+

EKS

+

aicr recipe --service eks --accelerator h100 --intent training --os ubuntu --platform slurm

+

Leaf overlay: h100-eks-ubuntu-training-slurm.

+
+
+

GKE

+

aicr recipe --service gke --accelerator h100 --intent training --os cos --platform slurm

+

Leaf overlay: h100-gke-cos-training-slurm.

+
+
+

Kind

+

aicr recipe --service kind --accelerator h100 --intent training --platform slurm

+

CPU-only NodeSet path for smoke and CI-style checks.

+
+
+
+ Shared Slinky shape + Cloud H100 leaves bake in Gres=gpu:h100:8 and matching nvidia.com/gpu: 8 slurmd limits so srun --gres=gpu:N works after deploy. +
+
+
+ + +
+
+
05 · Render
+

Bundle: deployment artifacts from the recipe

+
+
bundle
+
$ aicr bundle --recipe recipe.yaml \
+    --accelerated-node-selector nodeGroup=gpu-worker \
+    --accelerated-node-toleration dedicated=worker-workload:NoSchedule \
+    --accelerated-node-toleration dedicated=worker-workload:NoExecute \
+    --system-node-selector nodeGroup=system-worker \
+    --system-node-toleration dedicated=system-workload:NoSchedule \
+    --system-node-toleration dedicated=system-workload:NoExecute \
+    --storage-class <storage-class> \
+    --set slinkyslurm:nodesets.slinky.replicas=1 \
+    --output bundle
+
+
+

System placement

cert-manager, monitoring, and the Slinky operator land on the system pool.

+

Accelerated placement

nodesets.slinky lands on GPU workers with matching selectors and tolerations.

+

Slurm overrides

slinkyslurm and slurmcluster target the cluster chart values.

+
+
+
bundle layout
+
bundle/
+  deploy.sh
+  cert-manager/
+  prometheus-operator-crds/
+  kube-prometheus-stack/
+  prometheus-adapter/
+  k8s-ephemeral-storage-metrics/
+  gpu-operator/
+  nvidia-dra-driver-gpu/
+  nvsentinel/
+  nodewright-operator/
+  nodewright-customizations/
+  aws-ebs-csi-driver/
+  aws-efa/
+  slinky-slurm-operator-crds/
+  slinky-slurm-operator/
+  slinky-slurm/
+
+
+
+ + +
+
+
06 · Cluster shape
+

Three-pool bundle: Slurm control plane on CPU workers

+ + + + + + + +
RoleInstanceLabelTaint
GPU workerp5.48xlargenodeGroup=gpu-workerdedicated=worker-workload:NoSchedule
dedicated=worker-workload:NoExecute
Systemm4.16xlargenodeGroup=system-workerdedicated=system-workload:NoSchedule
dedicated=system-workload:NoExecute
CPU workerm4.16xlargenodeGroup=cpu-workerdedicated=worker-workload:NoSchedule
dedicated=worker-workload:NoExecute
+
+
bundle · cpu-worker Slurm control plane
+
$ WORKER_TOLS='[{"key":"dedicated","operator":"Equal","value":"worker-workload","effect":"NoSchedule"},{"key":"dedicated","operator":"Equal","value":"worker-workload","effect":"NoExecute"}]'
+
+$ aicr bundle \
+  --recipe recipe.yaml \
+  --deployer helm \
+  --system-node-selector nodeGroup=system-worker \
+  --system-node-toleration dedicated=system-workload:NoSchedule \
+  --system-node-toleration dedicated=system-workload:NoExecute \
+  --accelerated-node-selector nodeGroup=gpu-worker \
+  --accelerated-node-toleration dedicated=worker-workload:NoSchedule \
+  --accelerated-node-toleration dedicated=worker-workload:NoExecute \
+  --set awsebscsidriver:enabled=false \
+  --set nodewright:enabled=false \
+  --set nodewrightcustomizations:enabled=false \
+  --set-json "slinkyslurm:controller.podSpec={\"nodeSelector\":{\"nodeGroup\":\"cpu-worker\"},\"tolerations\":${WORKER_TOLS}}" \
+  --set-json "slinkyslurm:restapi.podSpec={\"nodeSelector\":{\"nodeGroup\":\"cpu-worker\"},\"tolerations\":${WORKER_TOLS}}" \
+  --set-json "slinkyslurm:loginsets.slinky.podSpec={\"nodeSelector\":{\"nodeGroup\":\"cpu-worker\"},\"tolerations\":${WORKER_TOLS}}" \
+  --output ./bundle
+
+
+
+ + +
+
+
07 · Cluster shape
+

Two-pool bundle: Slurm control plane on GPU workers

+ + + + + + +
RoleInstanceLabelTaint
GPU workerp5.48xlargenodeGroup=gpu-workerdedicated=worker-workload:NoSchedule
dedicated=worker-workload:NoExecute
Systemm4.16xlargenodeGroup=system-workerdedicated=system-workload:NoSchedule
dedicated=system-workload:NoExecute
+
+
bundle · gpu-worker Slurm control plane
+
$ WORKER_TOLS='[{"key":"dedicated","operator":"Equal","value":"worker-workload","effect":"NoSchedule"},{"key":"dedicated","operator":"Equal","value":"worker-workload","effect":"NoExecute"}]'
+
+$ aicr bundle \
+  --recipe recipe.yaml \
+  --deployer helm \
+  --system-node-selector nodeGroup=system-worker \
+  --system-node-toleration dedicated=system-workload:NoSchedule \
+  --system-node-toleration dedicated=system-workload:NoExecute \
+  --accelerated-node-selector nodeGroup=gpu-worker \
+  --accelerated-node-toleration dedicated=worker-workload:NoSchedule \
+  --accelerated-node-toleration dedicated=worker-workload:NoExecute \
+  --set-json "slinkyslurm:controller.podSpec={\"nodeSelector\":{\"nodeGroup\":\"gpu-worker\"},\"tolerations\":${WORKER_TOLS}}" \
+  --set-json "slinkyslurm:restapi.podSpec={\"nodeSelector\":{\"nodeGroup\":\"gpu-worker\"},\"tolerations\":${WORKER_TOLS}}" \
+  --set-json "slinkyslurm:loginsets.slinky.podSpec={\"nodeSelector\":{\"nodeGroup\":\"gpu-worker\"},\"tolerations\":${WORKER_TOLS}}" \
+  --output ./bundle
+
+
+ Two-pool shape + Without a CPU-worker pool, the Slurm controller, REST API, and login pod are pinned to GPU workers with the same worker tolerations. +
+
+
+ + +
+
+
08 · Apply
+

Deploy: install in dependency order

+
+
deploy
+
$ cd bundle
+$ ./deploy.sh
+
+
+ + + + + + Foundations + certs + monitoring + + Slurm API + CRDs + operator + + Cluster chart + controller + login + + NodeSet + slurmd workers + + + + + Slinky-managed Slurm cluster + controller · login pod · restapi · GPU NodeSet · Gres=gpu:h100:8 · nvidia.com/gpu limits + +
+
+ What AICR contributes + Deploy order for the Slurm path is explicit: cert-manager, then Slinky CRDs, then the Slinky operator, then the Slinky Slurm cluster chart. +
+
+
+ + +
+
+
09 · Prove
+

Validate: deployment and Slurm conformance

+
+
validate
+
$ aicr validate --recipe recipe.yaml \
+    --node-selector nodeGroup=system-worker \
+    --toleration dedicated=system-workload:NoSchedule \
+    --toleration dedicated=system-workload:NoExecute \
+    --phase deployment --phase conformance
+
+
+

Deployment phase

Chainsaw component health: CRDs, Deployments, DaemonSets, and Slinky Slurm readiness.

+

Conformance phase

slinky-slurm-health runs from the login pod and checks Slurm behavior.

+

Performance phase

Not supported yet for Slurm leaves; a Kubernetes Pod benchmark would bypass slurmd.

+

scontrol ping

Confirms the controller path responds.

+

Node state gate

Checks idle or mixed nodes before submitting work.

+

Bounded srun

Runs srun --immediate=5 --time=0:03 hostname.

+
+
+ Takeaway + Deployment says the resources are installed. Conformance says Slinky can actually drive Slurm from the login pod. +
+
+
+ +
+ +
+ 1 / 10 + ← → · Space · F fullscreen +
+ +
+
+ + + + diff --git a/docs/user/component-catalog.md b/docs/user/component-catalog.md index d7f5db4b2..b9befbf0e 100644 --- a/docs/user/component-catalog.md +++ b/docs/user/component-catalog.md @@ -38,7 +38,7 @@ The source of truth is [`recipes/registry.yaml`](https://github.com/NVIDIA/aicr/ | **kubeflow-trainer** | Kubeflow Training Operator for distributed training jobs (PyTorch, etc.). Manages multi-node training job lifecycle with JobSet integration. | [Kubeflow Trainer](https://github.com/kubeflow/trainer) | | **slinky-slurm-operator-crds** | Custom Resource Definitions for the SchedMD Slinky Slurm operator. Installs the `slinky.slurm.net` CRDs (Controller, NodeSet, LoginSet, Accounting, RestApi, Token). Installed separately to support CRD lifecycle management. | [Slinky Slurm Operator](https://github.com/SlinkyProject/slurm-operator) | | **slinky-slurm-operator** | SchedMD Slinky Slurm operator and admission webhook. Manages the lifecycle of Slurm clusters declared via Slinky CRs (Controller, NodeSet, LoginSet, Accounting, RestApi, Token). **Known limitation:** chart v1.1.0 silently ignores `operator.nodeSelector` and `webhook.nodeSelector` (current chart behavior, not a planned feature); tracking [SlinkyProject/slurm-operator#187](https://github.com/SlinkyProject/slurm-operator/pull/187) for the upstream fix. | [Slinky Slurm Operator](https://github.com/SlinkyProject/slurm-operator) | -| **slinky-slurm** | Slinky-managed Slurm cluster instance: Controller (slurmctld) + LoginSet (sackd/sshd) + NodeSet (slurmd) + RestApi (slurmrestd). Reconciled by `slinky-slurm-operator`. Declared inline per slurm leaf overlay alongside `slinky-slurm-operator-crds` and `slinky-slurm-operator` (matching the dynamo-platform pattern) so each leaf can carry its own GPU/GRES tuning. Accounting (slurmdbd) requires an external MariaDB and is disabled in defaults — see `recipes/components/slinky-slurm/values.yaml`. 
| [Slinky Slurm Cluster Chart](https://github.com/SlinkyProject/slurm-operator/tree/main/helm/slurm) | +| **slinky-slurm** | Slinky-managed Slurm cluster instance: Controller (slurmctld) + LoginSet (sackd/sshd) + NodeSet (slurmd) + RestApi (slurmrestd). Reconciled by `slinky-slurm-operator`. Declared inline per slurm leaf overlay alongside `slinky-slurm-operator-crds` and `slinky-slurm-operator` (matching the dynamo-platform pattern) so each leaf can carry its own GPU/GRES tuning. IMEX-capable leaves attach a fixed NVIDIA DRA `ComputeDomain` as a pre-manifest before the Slurm chart; the DRA driver reconciles it asynchronously into the `ResourceClaimTemplate` consumed by the NodeSet. Accounting (slurmdbd) requires an external MariaDB and is disabled in defaults — see `recipes/components/slinky-slurm/values.yaml`. | [Slinky Slurm Cluster Chart](https://github.com/SlinkyProject/slurm-operator/tree/main/helm/slurm) | | **nfd-ocp-olm** | OLM installer for Node Feature Discovery on OpenShift. Creates the OperatorGroup and Subscription resources that install NFD via the Operator Lifecycle Manager. Paired with `nfd-ocp`. OCP-specific. | [Node Feature Discovery (Certified)](https://catalog.redhat.com/software/container-stacks/detail/5ec53e8c110f56bd24f5f8db) | | **nfd-ocp** | Node Feature Discovery CR for OpenShift. Configures NFD's operand (worker, topology updater) via a NodeFeatureDiscovery custom resource. Deployed after `nfd-ocp-olm`. OCP-specific. | [Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery) | | **gpu-operator-ocp-olm** | OLM installer for the GPU Operator on OpenShift. Creates the OperatorGroup and Subscription resources that install the certified GPU Operator via the Operator Lifecycle Manager. Paired with `gpu-operator-ocp`. OCP-specific. | [NVIDIA GPU Operator (Certified)](https://catalog.redhat.com/software/container-stacks/detail/5e7b210b8a3c1e00013d636d) | @@ -53,7 +53,7 @@ Not every component appears in every recipe. 
The recipe engine selects component - **Base components** (cert-manager, kube-prometheus-stack) appear in most recipes. - **Cloud-specific components** (aws-efa, aws-ebs-csi-driver) are added when the service matches. OCP recipes replace base components (gpu-operator, nfd, network-operator) with OLM+CR pairs (e.g., `gpu-operator-ocp-olm` + `gpu-operator-ocp`). - **Intent-specific components** (agentgateway, agentgateway-crds) are added based on workload intent (e.g., inference recipes include the inference gateway). -- **Platform-specific components** (slinky-slurm-operator, slinky-slurm, kubeflow-trainer, dynamo-platform) are added when the recipe selects a matching `--platform`. For `--platform slurm`, all three Slinky pieces (`slinky-slurm-operator-crds`, `slinky-slurm-operator`, `slinky-slurm`) are declared inline per slurm leaf overlay — the same shape `dynamo-platform` uses across `*-inference-dynamo` leaves. Leaves that want the operator only inline the CRDs + operator and omit the `slinky-slurm` componentRef. For an end-to-end walkthrough (recipe → bundle → install → validate → `srun` smoke job on EKS, GKE, or Kind), see [`demos/cuj1-slinky-slurm.md`](https://github.com/NVIDIA/aicr/blob/main/demos/cuj1-slinky-slurm.md). +- **Platform-specific components** (slinky-slurm-operator, slinky-slurm, kubeflow-trainer, dynamo-platform) are added when the recipe selects a matching `--platform`. For `--platform slurm`, all three core Slinky pieces (`slinky-slurm-operator-crds`, `slinky-slurm-operator`, `slinky-slurm`) are declared inline per slurm leaf overlay — the same shape `dynamo-platform` uses across `*-inference-dynamo` leaves. IMEX-capable Slurm leaves attach a fixed ComputeDomain through `slinky-slurm.preManifestFiles` so slurmd pods can consume DRA-provisioned IMEX channels. Leaves that want the operator only inline the CRDs + operator and omit the `slinky-slurm` componentRef. 
For an end-to-end walkthrough (recipe → bundle → install → validate → `srun` smoke job on EKS, GKE, or Kind), see [`demos/cuj1-slinky-slurm.md`](https://github.com/NVIDIA/aicr/blob/main/demos/cuj1-slinky-slurm.md). - **Accelerator/OS-specific tuning** (nodewright-customizations, nvidia-dra-driver-gpu) varies by hardware and OS combination. ### NFD Topology Updater diff --git a/docs/user/recipe-health.md b/docs/user/recipe-health.md index 04c6a865f..aee813477 100644 --- a/docs/user/recipe-health.md +++ b/docs/user/recipe-health.md @@ -34,8 +34,8 @@ The matrix is computed **hermetically and offline**: every signal is a pure read {/* BEGIN AICR-HEALTH */} ## Summary -- Recipes: **39** -- Pass: **39** · Warn: **0** · Fail: **0** · Unknown: **0** +- Recipes: **43** +- Pass: **43** · Warn: **0** · Fail: **0** · Unknown: **0** ## Recipes @@ -46,6 +46,7 @@ The matrix is computed **hermetically and offline**: every signal is a pure read | gb200-any | — | gb200 | — | — | — | pass | R:0 D:4 P:0 C:0 | pending | | h100-any | — | h100 | — | — | — | pass | R:0 D:4 P:0 C:0 | pending | | h200-any | — | h200 | — | — | — | pass | R:0 D:4 P:0 C:0 | pending | +| l40s-any | — | l40s | — | — | — | pass | R:0 D:4 P:0 C:0 | pending | | rtx-pro-6000-any | — | rtx-pro-6000 | — | — | — | pass | R:0 D:4 P:0 C:0 | pending | | monitoring-hpa | — | — | — | — | — | pass | R:0 D:0 P:0 C:0 | pending | | a100-aks-ubuntu-training-kubeflow | aks | a100 | ubuntu | training | kubeflow | pass | R:0 D:4 P:0 C:10 | pending | @@ -56,6 +57,7 @@ The matrix is computed **hermetically and offline**: every signal is a pure read | a100-eks-ubuntu-training-kubeflow | eks | a100 | ubuntu | training | kubeflow | pass | R:0 D:4 P:0 C:10 | pending | | gb200-eks-ubuntu-inference-dynamo | eks | gb200 | ubuntu | inference | dynamo | pass | R:0 D:4 P:1 C:10 | pending | | gb200-eks-ubuntu-training-kubeflow | eks | gb200 | ubuntu | training | kubeflow | pass | R:0 D:4 P:2 C:8 | pending | +| gb200-eks-ubuntu-training-slurm | 
eks | gb200 | ubuntu | training | slurm | pass | R:0 D:4 P:0 C:10 | pending | | h100-eks-ubuntu-inference-dynamo | eks | h100 | ubuntu | inference | dynamo | pass | R:0 D:4 P:1 C:11 | pending | | h100-eks-ubuntu-inference-nim | eks | h100 | ubuntu | inference | nim | pass | R:0 D:4 P:0 C:11 | pending | | h100-eks-ubuntu-training-kubeflow | eks | h100 | ubuntu | training | kubeflow | pass | R:0 D:4 P:1 C:10 | pending | @@ -80,5 +82,7 @@ The matrix is computed **hermetically and offline**: every signal is a pure read | a100-oke-ubuntu-training-kubeflow | oke | a100 | ubuntu | training | kubeflow | pass | R:0 D:4 P:0 C:8 | pending | | gb200-oke-ubuntu-inference-dynamo | oke | gb200 | ubuntu | inference | dynamo | pass | R:0 D:4 P:1 C:10 | pending | | gb200-oke-ubuntu-training-kubeflow | oke | gb200 | ubuntu | training | kubeflow | pass | R:0 D:4 P:1 C:8 | pending | +| l40s-oke-inference | oke | l40s | ol | inference | — | pass | R:0 D:4 P:0 C:8 | pending | +| l40s-oke-training | oke | l40s | ol | training | — | pass | R:0 D:4 P:0 C:8 | pending | {/* END AICR-HEALTH */} diff --git a/pkg/collector/k8s/server_test.go b/pkg/collector/k8s/server_test.go index 06b778fd9..6f07f2540 100644 --- a/pkg/collector/k8s/server_test.go +++ b/pkg/collector/k8s/server_test.go @@ -16,6 +16,7 @@ package k8s import ( "context" + "os" "testing" "time" @@ -92,6 +93,16 @@ func TestKubernetesCollector_CollectWithTimeout(t *testing.T) { } func TestKubernetesCollector_ErrorRecovery_NilClient(t *testing.T) { + // Match the client package's discovery-isolation pattern so this test + // cannot select a real workstation kubeconfig. 
+ t.Setenv("KUBECONFIG", os.Getenv("KUBECONFIG")) + if err := os.Unsetenv("KUBECONFIG"); err != nil { + t.Fatalf("unset KUBECONFIG: %v", err) + } + home := t.TempDir() + t.Setenv("HOME", home) + t.Setenv("USERPROFILE", home) + ctx := context.TODO() // Create collector without a valid client diff --git a/pkg/recipe/deployment_order_guard_test.go b/pkg/recipe/deployment_order_guard_test.go index 61ce26480..7750820b6 100644 --- a/pkg/recipe/deployment_order_guard_test.go +++ b/pkg/recipe/deployment_order_guard_test.go @@ -175,6 +175,30 @@ func TestDeploymentOrderGuards(t *testing.T) { {"gpu-operator", "nvsentinel"}, }, }, + { + name: "gb200-eks-ubuntu-training-slurm", + criteria: func() *Criteria { + c := NewCriteria() + c.Service = CriteriaServiceEKS + c.Accelerator = CriteriaAcceleratorGB200 + c.OS = CriteriaOSUbuntu + c.Intent = CriteriaIntentTraining + c.Platform = CriteriaPlatformSlurm + return c + }, + requiredDeps: map[string][]string{ + "slinky-slurm-operator": {"cert-manager", "slinky-slurm-operator-crds"}, + "slinky-slurm": {"nvidia-dra-driver-gpu", "slinky-slurm-operator", "slinky-slurm-operator-crds"}, + }, + requiredOrdering: [][2]string{ + {"nvidia-dra-driver-gpu", "slinky-slurm"}, + {"cert-manager", "slinky-slurm-operator"}, + {"slinky-slurm-operator-crds", "slinky-slurm-operator"}, + {"slinky-slurm-operator", "slinky-slurm"}, + {"slinky-slurm-operator-crds", "slinky-slurm"}, + {"gpu-operator", "nvsentinel"}, + }, + }, { name: "h100-eks-ubuntu-training-slurm", criteria: func() *Criteria { diff --git a/pkg/recipe/metadata_store_test.go b/pkg/recipe/metadata_store_test.go index d27b10419..474128a4c 100644 --- a/pkg/recipe/metadata_store_test.go +++ b/pkg/recipe/metadata_store_test.go @@ -27,6 +27,7 @@ import ( "testing" aicrerrors "github.com/NVIDIA/aicr/pkg/errors" + "github.com/NVIDIA/aicr/pkg/manifest" "golang.org/x/sync/errgroup" "gopkg.in/yaml.v3" ) @@ -631,6 +632,7 @@ func TestSlurmLeavesClearInheritedPerformancePhase(t *testing.T) { } for _, name 
:= range []string{ + "gb200-eks-ubuntu-training-slurm", "h100-eks-ubuntu-training-slurm", "h100-gke-cos-training-slurm", } { @@ -676,6 +678,18 @@ func TestSlurmLeavesAppendConformanceHealthCheck(t *testing.T) { "secure-accelerator-access", "slinky-slurm-health", } + gb200ConformanceChecks := []string{ + "platform-health", + "gpu-operator-health", + "dra-support", + "accelerator-metrics", + "ai-service-metrics", + "gang-scheduling", + "pod-autoscaling", + "cluster-autoscaling", + "slinky-slurm-health", + "slinky-slurm-imex-channel", + } kindConformanceChecks := []string{ "platform-health", "gpu-operator-health", @@ -693,6 +707,7 @@ func TestSlurmLeavesAppendConformanceHealthCheck(t *testing.T) { name string want []string }{ + {name: "gb200-eks-ubuntu-training-slurm", want: gb200ConformanceChecks}, {name: "h100-eks-ubuntu-training-slurm", want: conformanceChecks}, {name: "h100-gke-cos-training-slurm", want: conformanceChecks}, {name: "h100-kind-training-slurm", want: kindConformanceChecks}, @@ -718,6 +733,204 @@ func TestSlurmLeavesAppendConformanceHealthCheck(t *testing.T) { } } +func TestGB200EKSSlurmWiresIMEXComputeDomain(t *testing.T) { + ctx := context.Background() + store, err := loadMetadataStore(ctx) + if err != nil { + t.Fatalf("failed to load metadata store: %v", err) + } + + leaf, ok := store.GetRecipeByName("gb200-eks-ubuntu-training-slurm") + if !ok { + t.Fatal("overlay gb200-eks-ubuntu-training-slurm not found in store") + } + result, err := store.BuildRecipeResult(ctx, leaf.Spec.Criteria) + if err != nil { + t.Fatalf("BuildRecipeResult failed: %v", err) + } + if !slices.ContainsFunc( + result.Constraints, + func(c Constraint) bool { + return c.Name == "K8s.server.version" && c.Value == ">= 1.34" + }, + ) { + + t.Errorf("constraints = %v, want K8s.server.version >= 1.34 for DRA v1", result.Constraints) + } + + if computeDomain := result.GetComponentRef("slinky-slurm-imex-compute-domain"); computeDomain != nil { + t.Errorf("standalone IMEX ComputeDomain 
component = %+v, want absent", computeDomain) + } + slurm := result.GetComponentRef("slinky-slurm") + if slurm == nil { + t.Fatal("slinky-slurm component missing") + } + const manifestPath = "components/slinky-slurm/manifests/compute-domain.yaml" + if !slices.Contains(slurm.PreManifestFiles, manifestPath) { + t.Errorf("slinky-slurm preManifestFiles = %v, want %q", slurm.PreManifestFiles, manifestPath) + } + if !slices.Contains(slurm.DependencyRefs, "nvidia-dra-driver-gpu") { + t.Errorf("slinky-slurm dependencyRefs = %v, want nvidia-dra-driver-gpu", slurm.DependencyRefs) + } + + values, err := result.GetValuesForComponent("slinky-slurm") + if err != nil { + t.Fatalf("GetValuesForComponent(slinky-slurm) failed: %v", err) + } + if got := valueAtPath[string](t, values, "controller", "extraConfMap", "SwitchType"); got != "switch/nvidia_imex" { + t.Errorf("controller.extraConfMap.SwitchType = %q, want switch/nvidia_imex", got) + } + if got := valueAtPath[string](t, values, "nodesets", "slinky", "extraConfMap", "Gres"); got != "gpu:gb200:4" { + t.Errorf("nodesets.slinky.extraConfMap.Gres = %q, want gpu:gb200:4", got) + } + + podClaims := valueAtPath[[]any](t, values, "nodesets", "slinky", "podSpec", "resourceClaims") + assertSingleNameField(t, podClaims, "name", "imex-channels") + assertSingleNameField(t, podClaims, "resourceClaimTemplateName", "slinky-slurm-imex-channels") + nodeSetClaim, ok := podClaims[0].(map[string]any) + if !ok { + t.Fatalf("podClaims[0] = %T, want map[string]any", podClaims[0]) + } + nodeSetRCTName, ok := nodeSetClaim["resourceClaimTemplateName"].(string) + if !ok { + t.Fatalf("podClaims[0].resourceClaimTemplateName = %T, want string", nodeSetClaim["resourceClaimTemplateName"]) + } + containerClaims := valueAtPath[[]any](t, values, "nodesets", "slinky", "slurmd", "resources", "claims") + assertSingleNameField(t, containerClaims, "name", "imex-channels") + slurmd := valueAtPath[map[string]any](t, values, "nodesets", "slinky", "slurmd") + if got, ok 
:= slurmd["securityContext"]; ok { + t.Errorf("nodesets.slinky.slurmd.securityContext = %v, want omitted to use chart default", got) + } + + content, err := GetManifestContentWithContext(ctx, result.DataProvider(), manifestPath) + if err != nil { + t.Fatalf("GetManifestContentWithContext(%q) failed: %v", manifestPath, err) + } + rendered, err := manifest.Render(content, manifest.RenderInput{ + ComponentName: slurm.Name, + Namespace: slurm.Namespace, + ChartName: slurm.Chart, + ChartVersion: slurm.Version, + Values: values, + }) + if err != nil { + t.Fatalf("render ComputeDomain manifest: %v", err) + } + var computeDomain map[string]any + if err := yaml.Unmarshal(rendered, &computeDomain); err != nil { + t.Fatalf("unmarshal rendered ComputeDomain: %v", err) + } + computeDomainRCTName := valueAtPath[string](t, computeDomain, "spec", "channel", "resourceClaimTemplate", "name") + if computeDomainRCTName != nodeSetRCTName { + t.Errorf("ComputeDomain RCT name = %q, NodeSet RCT name = %q", computeDomainRCTName, nodeSetRCTName) + } +} + +func TestSlinkySlurmIMEXComputeDomainFixedIdentityCannotBeOverridden(t *testing.T) { + ctx := context.Background() + const manifestPath = "components/slinky-slurm/manifests/compute-domain.yaml" + content, err := GetManifestContentWithContext(ctx, nil, manifestPath) + if err != nil { + t.Fatalf("GetManifestContentWithContext(%q) failed: %v", manifestPath, err) + } + + // Scalar --set and typed --set-json/--set-file converge on this final + // values map before local manifests are rendered. None may change the + // immutable ComputeDomain integration contract. 
+ tests := []struct { + name string + values map[string]any + }{ + { + name: "scalar --set", + values: map[string]any{ + "name": "from-set", + "allocationMode": "Immediate", + "resourceClaimTemplateName": "from-set", + }, + }, + { + name: "typed --set-json or --set-file", + values: map[string]any{ + "name": "from-typed-set", + "allocationMode": "Immediate", + "resourceClaimTemplateName": "from-typed-set", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rendered, renderErr := manifest.Render(content, manifest.RenderInput{ + ComponentName: "slinky-slurm", + Namespace: "slurm", + ChartName: "slurm", + ChartVersion: "1.1.0", + Values: tt.values, + }) + if renderErr != nil { + t.Fatalf("render ComputeDomain manifest: %v", renderErr) + } + + for _, want := range []string{ + "name: slinky-slurm-imex", + "allocationMode: All", + "name: slinky-slurm-imex-channels", + } { + if !strings.Contains(string(rendered), want) { + t.Errorf("rendered ComputeDomain manifest missing fixed value %q:\n%s", want, rendered) + } + } + for _, unwanted := range []string{"from-set", "from-typed-set", "allocationMode: Immediate"} { + if strings.Contains(string(rendered), unwanted) { + t.Errorf("rendered ComputeDomain manifest contains override value %q:\n%s", unwanted, rendered) + } + } + }) + } +} + +func valueAtPath[T any](t *testing.T, root map[string]any, path ...string) T { + t.Helper() + + if len(path) == 0 { + t.Fatal("value path must not be empty") + } + + var current any = root + for _, key := range path { + m, ok := current.(map[string]any) + if !ok { + t.Fatalf("%q parent is %T, want map[string]any", key, current) + } + current, ok = m[key] + if !ok { + t.Fatalf("missing nested key path %v", path) + } + } + value, ok := current.(T) + if !ok { + var expected T + t.Fatalf("nested path %v = %T, want %T", path, current, expected) + } + return value +} + +func assertSingleNameField(t *testing.T, items []any, field, want string) { + t.Helper() + + if 
len(items) != 1 { + t.Fatalf("items length = %d, want 1: %v", len(items), items) + } + item, ok := items[0].(map[string]any) + if !ok { + t.Fatalf("items[0] = %T, want map[string]any", items[0]) + } + if got, ok := item[field].(string); !ok || got != want { + t.Fatalf("items[0].%s = %v, want %q", field, item[field], want) + } +} + // TestEvaluatorFailingLeafExcludesCandidate verifies that when a leaf overlay's // constraints fail evaluation, no ancestor overlay is used as a fallback // candidate. With maximal leaf selection, ancestors are not independent diff --git a/recipes/components/slinky-slurm/manifests/compute-domain.yaml b/recipes/components/slinky-slurm/manifests/compute-domain.yaml new file mode 100644 index 000000000..3bf024787 --- /dev/null +++ b/recipes/components/slinky-slurm/manifests/compute-domain.yaml @@ -0,0 +1,35 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# These defaults must stay aligned with the Slinky NodeSet in the GB200 Slurm +# overlay. If a customer overrides the claim name or ResourceClaimTemplate name +# and introduces a mismatch, Slurm node pods can remain Pending. +--- +apiVersion: resource.nvidia.com/v1beta1 +kind: ComputeDomain +metadata: + annotations: + # preManifestFiles and dependencyRefs provide ordering for this CR. 
+ aicr/skip-hook-validation: "true" + labels: + app.kubernetes.io/created-by: aicr + app.kubernetes.io/managed-by: {{ .Release.Service }} + helm.sh/chart: {{ printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} + name: slinky-slurm-imex + namespace: {{ .Release.Namespace }} +spec: + channel: + allocationMode: All + resourceClaimTemplate: + name: slinky-slurm-imex-channels diff --git a/recipes/overlays/gb200-eks-ubuntu-training-slurm.yaml b/recipes/overlays/gb200-eks-ubuntu-training-slurm.yaml new file mode 100644 index 000000000..e09e13f32 --- /dev/null +++ b/recipes/overlays/gb200-eks-ubuntu-training-slurm.yaml @@ -0,0 +1,125 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.run/v1alpha2 +metadata: + name: gb200-eks-ubuntu-training-slurm + +spec: + # GB200 + EKS + Ubuntu + training with the Slinky operator and a + # Slinky-managed Slurm cluster. GB200 GPU GRES is declared inline below + # (pod-side limit + slurmd-side Gres= line); remaining EKS-specific + # tuning (gp3 storage, DCGM job-mapping) is layered at install time + # via `aicr bundle ... --set slinkyslurm:...` or a valuesFile. 
+ base: gb200-eks-ubuntu-training + + criteria: + service: eks + accelerator: gb200 + os: ubuntu + intent: training + platform: slurm + + mixins: + - os-ubuntu + + constraints: + # The IMEX ComputeDomain reconciles a ResourceClaimTemplate through the + # GA resource.k8s.io/v1 DRA API, available from Kubernetes 1.34. + - name: K8s.server.version + value: ">= 1.34" + + # The Slinky operator (CRDs + operator + cluster instance) is declared + # inline per slurm leaf, mirroring the dynamo-platform pattern in + # h100-*-inference-dynamo leaves. Inlining lets each leaf carry its + # own GPU/GRES tuning without fighting the mixin-vs-leaf identity-field + # guard in mixinComponentRefSafeForMerge (pkg/recipe/metadata_store.go), + # and keeps base.yaml free of platform-specific components. + # + # GPU GRES on slinky-slurm must be declared in two places because the + # chart does not derive Gres= in slurm.conf from pod resource limits + # (see comment in components/slinky-slurm/values.yaml): + # 1. nodesets.slinky.extraConfMap.Gres — adds `Gres=gpu:gb200:4` to + # slurmd's --conf so slurmctld knows it has GPUs to allocate via + # `srun --gres=gpu:N`. + # 2. nodesets.slinky.slurmd.resources.limits.nvidia.com/gpu — reserves + # 4 GB200s on the slurmd pod so the NVIDIA device plugin injects + # /dev/nvidia* into the container. Without this `gres.conf`'s + # AutoDetect=nvidia finds nothing. `requests` is omitted: Kubernetes + # auto-mirrors requests=limits for extended resources. + # The count is per Kubernetes GPU node / slurmd pod (p6e-gb200.36xlarge + # exposes nvidia.com/gpu.count=4), not cluster-total or rack-total capacity. + # Accelerated nodeSelector/tolerations on slurmd are injected via the + # registry's nodesets.slinky.podSpec.{nodeSelector,tolerations} paths. 
+ componentRefs: + - name: slinky-slurm-operator-crds + type: Helm + valuesFile: components/slinky-slurm-operator-crds/values.yaml + + - name: slinky-slurm-operator + type: Helm + valuesFile: components/slinky-slurm-operator/values.yaml + dependencyRefs: + - cert-manager + - slinky-slurm-operator-crds + + - name: slinky-slurm + type: Helm + valuesFile: components/slinky-slurm/values.yaml + preManifestFiles: + # Submit this immutable ComputeDomain after the NVIDIA DRA driver and + # before the Slinky chart creates NodeSet pods that consume its RCT. + # RCT reconciliation may complete asynchronously after this apply. + - components/slinky-slurm/manifests/compute-domain.yaml + dependencyRefs: + - nvidia-dra-driver-gpu + - slinky-slurm-operator + - slinky-slurm-operator-crds + overrides: + controller: + extraConfMap: + SwitchType: "switch/nvidia_imex" + nodesets: + slinky: + extraConfMap: + Gres: "gpu:gb200:4" + podSpec: + resourceClaims: + # The claim name and ResourceClaimTemplate name are internal + # integration values and must not be overridden. They must + # stay aligned with the ComputeDomain pre-manifest; a mismatch + # can leave Slurm node pods Pending. + - name: imex-channels + resourceClaimTemplateName: slinky-slurm-imex-channels + slurmd: + resources: + limits: + nvidia.com/gpu: 4 + claims: + - name: imex-channels + + # K8s-native nccl-all-reduce-bw checks are dropped on Slinky leaves: + # those checks launch Pods against the cluster scheduler, so on a + # Slinky-managed cluster they bypass slurmd entirely and measure the + # wrong path. Slurm-specific health is covered by the conformance check. + validation: + conformance: + checks: + - slinky-slurm-health + # Selected only by IMEX-capable Slinky Slurm recipes. 
+ - slinky-slurm-imex-channel + performance: + checks: [] + constraints: [] diff --git a/recipes/validators/README.md b/recipes/validators/README.md index f6ff1791a..555044ace 100644 --- a/recipes/validators/README.md +++ b/recipes/validators/README.md @@ -63,6 +63,8 @@ Applied by `catalog.Load` (`pkg/validator/catalog/catalog.go`) in order: | `cluster-autoscaling` | Verify cluster autoscaling with Karpenter | 10m | | `robust-controller` | Verify Dynamo operator controller and webhooks | 5m | | `secure-accelerator-access` | Verify secure GPU access via DRA (no host device mounts) | 10m | +| `slinky-slurm-health` | Verify Slinky Slurm controller, node inventory, and job submission health | 5m | +| `slinky-slurm-imex-channel` | Verify fixed IMEX resources and distinct channels for concurrent Slinky Slurm jobs | 5m | | `gpu-operator-health` | Verify GPU operator health (conformance diagnostic) | 2m | | `platform-health` | Verify platform component health (conformance diagnostic) | 5m | diff --git a/recipes/validators/catalog.yaml b/recipes/validators/catalog.yaml index 372881afe..ac889253a 100644 --- a/recipes/validators/catalog.yaml +++ b/recipes/validators/catalog.yaml @@ -215,6 +215,13 @@ validators: timeout: 5m args: ["slinky-slurm-health"] env: [] + - name: slinky-slurm-imex-channel + phase: conformance + description: "Verify fixed IMEX resources and distinct channels for concurrent Slinky Slurm jobs" + image: ghcr.io/nvidia/aicr-validators/conformance:latest + timeout: 5m + args: ["slinky-slurm-imex-channel"] + env: [] - name: gpu-operator-health phase: conformance description: "Verify GPU operator health (conformance diagnostic)" diff --git a/validators/conformance/consts.go b/validators/conformance/consts.go index 186976ad3..20b4a5407 100644 --- a/validators/conformance/consts.go +++ b/validators/conformance/consts.go @@ -27,7 +27,8 @@ const ( resourceCRDs = "customresourcedefinitions" // versionV1alpha1 is the API version used by legacy NVIDIA and TrainJob 
CRDs. versionV1alpha1 = "v1alpha1" - // versionV1beta1 is the API version used by DynamoGraphDeployment in Dynamo 1.2. + // versionV1beta1 is the API version used by DynamoGraphDeployment in + // Dynamo 1.2 and NVIDIA ComputeDomain. versionV1beta1 = "v1beta1" // labelNVIDIAGPUPresent is the "key=value" selector for GPU-bearing nodes // when scaled-up via the cluster autoscaler. diff --git a/validators/conformance/main.go b/validators/conformance/main.go index e9ade7f02..b5b8dcef0 100644 --- a/validators/conformance/main.go +++ b/validators/conformance/main.go @@ -38,6 +38,7 @@ func main() { "robust-controller": CheckRobustController, "secure-accelerator-access": CheckSecureAcceleratorAccess, "slinky-slurm-health": CheckSlinkySlurmHealth, + "slinky-slurm-imex-channel": CheckSlinkySlurmIMEXChannel, "gpu-operator-health": CheckGPUOperatorHealth, "platform-health": CheckPlatformHealth, }) diff --git a/validators/conformance/slinky_slurm_imex_channel_check.go b/validators/conformance/slinky_slurm_imex_channel_check.go new file mode 100644 index 000000000..fcb3873e2 --- /dev/null +++ b/validators/conformance/slinky_slurm_imex_channel_check.go @@ -0,0 +1,231 @@ +// Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "fmt" + "strings" + + "github.com/NVIDIA/aicr/pkg/errors" + "github.com/NVIDIA/aicr/validators" + "golang.org/x/sync/errgroup" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +const ( + slinkySlurmIMEXComputeDomainName = "slinky-slurm-imex" + slinkySlurmIMEXResourceClaimTemplateName = "slinky-slurm-imex-channels" + slinkySlurmIMEXChannelPrefix = "/dev/nvidia-caps-imex-channels/channel" + + // slinkySlurmIMEXChannelShell runs two overlapping but bounded Slurm jobs. + // Each direct srun requests one GPU and a small CPU/memory footprint so the + // two allocations can run concurrently without reserving the whole node. + // --immediate prevents an unschedulable job from remaining pending; --time + // bounds each allocation even if the shell or channel check misbehaves. + slinkySlurmIMEXChannelShell = `run() { + srun \ + --immediate=30 \ + --time=1:00 \ + --nodes=1 \ + --ntasks=1 \ + --cpus-per-task=1 \ + --mem=128M \ + --gres=gpu:1 \ + /bin/sh -c ' + channel=$(find /dev/nvidia-caps-imex-channels -maxdepth 1 -type c -name "channel*" -print) || { + printf "IMEX_CHANNEL_ERROR=find failed\n" >&2 + exit 1 + } + if test -n "$channel"; then + channel_count=$(printf "%s\n" "$channel" | wc -l) + else + channel_count=0 + fi + printf "IMEX_CHANNEL_COUNT=%s\n" "$channel_count" >&2 + printf "IMEX_CHANNEL_CANDIDATES:\n%s\n" "$channel" >&2 + if test "$channel_count" -ne 1; then + printf "IMEX_CHANNEL_ERROR=expected exactly one channel\n" >&2 + exit 1 + fi + printf "IMEX_CHANNEL=%s\n" "$channel" + # Hold longer than --immediate so two successful jobs must overlap. + sleep 40 + ' +} +run & first=$! +run & second=$! +wait "$first"; first_rc=$? +wait "$second"; second_rc=$? 
+test "$first_rc" -eq 0 && test "$second_rc" -eq 0` +) + +var slinkySlurmIMEXComputeDomainGVR = schema.GroupVersionResource{ + Group: "resource.nvidia.com", + Version: versionV1beta1, + Resource: "computedomains", +} + +// CheckSlinkySlurmIMEXChannel verifies that two concurrent Slurm jobs +// each receive a distinct NVIDIA IMEX channel. IMEX-capable recipes opt in by +// selecting this check explicitly, without coupling it to a hardware name. +func CheckSlinkySlurmIMEXChannel(ctx *validators.Context) error { + if ctx.Clientset == nil { + return errors.New(errors.ErrCodeInvalidRequest, "kubernetes client is not available") + } + if ctx.RESTConfig == nil { + return errors.New(errors.ErrCodeInvalidRequest, "RESTConfig is not available") + } + if ctx.ValidationInput == nil { + return errors.New(errors.ErrCodeInvalidRequest, "validation is not available") + } + if !recipeHasComponent(ctx, slinkySlurmComponent) { + return validators.Skip("slinky-slurm component not present in recipe") + } + + namespace := resolveSlinkySlurmNamespace(ctx) + if err := discoverSlinkySetAPIs(ctx); err != nil { + return err + } + if err := skipIfAllNodeSetPodsAreKWOK(ctx, namespace); err != nil { + return err + } + if err := requireSlinkySlurmIMEXResources(ctx, namespace); err != nil { + return err + } + + loginPod, err := findReadySlinkyLoginPod(ctx, namespace) + if err != nil { + return err + } + result, execErr := slinkyExecCommand( + ctx.Ctx, + ctx, + namespace, + loginPod.Name, + []string{"/bin/sh", "-c", slinkySlurmIMEXChannelShell}, + slinkyLoginPodExecOptions, + ) + recordSlinkySlurmIMEXResult(ctx, namespace, loginPod.Name, result, execErr) + if execErr != nil { + return errors.Wrap(errors.ErrCodeInternal, "failed to run concurrent Slinky Slurm IMEX jobs", execErr) + } + if result.ExitCode != 0 { + return errors.New(errors.ErrCodeInternal, + fmt.Sprintf("concurrent Slinky Slurm IMEX jobs: exit code %d", result.ExitCode)) + } + + channels, err := 
parseSlinkySlurmIMEXChannels(result.Stdout) + if err != nil { + return err + } + recordRawTextArtifact(ctx, "Slinky Slurm IMEX channels", "", + fmt.Sprintf("first=%s\nsecond=%s", channels[0], channels[1])) + return nil +} + +func requireSlinkySlurmIMEXResources(ctx *validators.Context, namespace string) error { + dynamicClient, err := getDynamicClient(ctx) + if err != nil { + return err + } + + group, groupCtx := errgroup.WithContext(ctx.Ctx) + group.Go(func() error { + _, getErr := dynamicClient.Resource(slinkySlurmIMEXComputeDomainGVR).Namespace(namespace).Get( + groupCtx, + slinkySlurmIMEXComputeDomainName, + metav1.GetOptions{}, + ) + if getErr != nil { + return slinkySlurmIMEXResourceLookupError( + "ComputeDomain", namespace, slinkySlurmIMEXComputeDomainName, getErr) + } + return nil + }) + group.Go(func() error { + _, getErr := ctx.Clientset.ResourceV1().ResourceClaimTemplates(namespace).Get( + groupCtx, + slinkySlurmIMEXResourceClaimTemplateName, + metav1.GetOptions{}, + ) + if getErr != nil { + return slinkySlurmIMEXResourceLookupError( + "ResourceClaimTemplate", namespace, slinkySlurmIMEXResourceClaimTemplateName, getErr) + } + return nil + }) + + if err := group.Wait(); err != nil { + return err + } + return nil +} + +func slinkySlurmIMEXResourceLookupError(kind, namespace, name string, err error) error { + code := errors.ErrCodeInternal + if apierrors.IsNotFound(err) { + code = errors.ErrCodeNotFound + } + return errors.Wrap(code, fmt.Sprintf("failed to get %s %s/%s", kind, namespace, name), err) +} + +func parseSlinkySlurmIMEXChannels(stdout string) ([2]string, error) { + channels := make([]string, 0, 2) + for _, line := range strings.Split(stdout, "\n") { + channel, found := strings.CutPrefix(strings.TrimSpace(line), "IMEX_CHANNEL=") + if !found { + continue + } + channel = strings.TrimSpace(channel) + suffix := strings.TrimPrefix(channel, slinkySlurmIMEXChannelPrefix) + if suffix == "" || suffix == channel || strings.Contains(suffix, "/") { + return 
[2]string{}, errors.New(errors.ErrCodeInternal, + fmt.Sprintf("invalid IMEX channel path %q", channel)) + } + channels = append(channels, channel) + } + if len(channels) != 2 { + return [2]string{}, errors.New(errors.ErrCodeInternal, + fmt.Sprintf("expected two IMEX channels, got %d", len(channels))) + } + if channels[0] == channels[1] { + return [2]string{}, errors.New(errors.ErrCodeInternal, + fmt.Sprintf("concurrent Slurm jobs received the same IMEX channel %q", channels[0])) + } + return [2]string{channels[0], channels[1]}, nil +} + +func recordSlinkySlurmIMEXResult( + ctx *validators.Context, + namespace, podName string, + result podExecResult, + execErr error, +) { + + var body strings.Builder + fmt.Fprintf(&body, "Pod: %s/%s\n", namespace, podName) + fmt.Fprintf(&body, "Command: /bin/sh -c %s\n", slinkySlurmIMEXChannelShell) + fmt.Fprintf(&body, "ExitCode: %d\n", result.ExitCode) + if execErr != nil { + fmt.Fprintf(&body, "Error: %v\n", execErr) + } + fmt.Fprintf(&body, "\nstdout:\n%s\n\nstderr:\n%s\n", result.Stdout, result.Stderr) + + recordRawTextArtifact(ctx, "Slinky Slurm IMEX channel result", + fmt.Sprintf("kubectl exec -n %s %s -- /bin/sh -c ''", namespace, podName), + body.String()) +} diff --git a/validators/conformance/slinky_slurm_imex_channel_check_test.go b/validators/conformance/slinky_slurm_imex_channel_check_test.go new file mode 100644 index 000000000..38b5111ac --- /dev/null +++ b/validators/conformance/slinky_slurm_imex_channel_check_test.go @@ -0,0 +1,344 @@ +// Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "strings" + "testing" + + "github.com/NVIDIA/aicr/pkg/errors" + "github.com/NVIDIA/aicr/pkg/recipe" + v1 "github.com/NVIDIA/aicr/pkg/validator/v1" + "github.com/NVIDIA/aicr/validators" + resourcev1 "k8s.io/api/resource/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + k8sfake "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/rest" +) + +func TestCheckSlinkySlurmIMEXChannelSkipsWithoutEnabledSlinkyComponent(t *testing.T) { + tests := []struct { + name string + componentRefs []recipe.ComponentRef + }{ + { + name: "component absent", + componentRefs: []recipe.ComponentRef{{Name: "gpu-operator"}}, + }, + { + name: "component disabled", + componentRefs: []recipe.ComponentRef{{ + Name: slinkySlurmComponent, + Overrides: map[string]any{"enabled": false}, + }}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := &validators.Context{ + Ctx: context.Background(), + Clientset: k8sfake.NewSimpleClientset(), + RESTConfig: &rest.Config{Host: "https://example.test"}, + ValidationInput: &v1.ValidationInput{ + ComponentRefs: tt.componentRefs, + }, + } + + err := CheckSlinkySlurmIMEXChannel(ctx) + if !isSkipLike(err, "slinky-slurm") { + t.Fatalf("error = %v, want skip mentioning slinky-slurm", err) + } + }) + } +} + +func TestCheckSlinkySlurmIMEXChannel(t *testing.T) { + // Channel numbers are arbitrary; only distinct, well-formed paths matter. 
+ tests := []struct { + name string + stdout string + result podExecResult + execErr error + wantErr string + }{ + { + name: "distinct channels pass", + stdout: "IMEX_CHANNEL=/dev/nvidia-caps-imex-channels/channel2\n" + + "IMEX_CHANNEL=/dev/nvidia-caps-imex-channels/channel3\n", + }, + { + name: "missing channel fails", + stdout: "IMEX_CHANNEL=/dev/nvidia-caps-imex-channels/channel2\n", + wantErr: "expected two IMEX channels", + }, + { + name: "duplicate channel fails", + stdout: "IMEX_CHANNEL=/dev/nvidia-caps-imex-channels/channel2\n" + + "IMEX_CHANNEL=/dev/nvidia-caps-imex-channels/channel2\n", + wantErr: "same IMEX channel", + }, + { + name: "nonzero command fails", + result: podExecResult{ExitCode: 1, Stderr: "srun failed"}, + wantErr: "exit code 1", + }, + { + name: "exec error fails", + execErr: errors.New(errors.ErrCodeInternal, "exec failed"), + wantErr: "exec failed", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := slinkyIMEXTestContext(t, true, true) + restore := replaceSlinkyExecForTest(func( + context.Context, + *validators.Context, + string, + string, + []string, + podExecOptions, + ) (podExecResult, error) { + + result := tt.result + if result.Stdout == "" { + result.Stdout = tt.stdout + } + return result, tt.execErr + }) + defer restore() + + err := CheckSlinkySlurmIMEXChannel(ctx) + if tt.wantErr == "" && err != nil { + t.Fatalf("error = %v, want nil", err) + } + if tt.wantErr != "" && (err == nil || !strings.Contains(err.Error(), tt.wantErr)) { + t.Fatalf("error = %v, want containing %q", err, tt.wantErr) + } + }) + } +} + +func TestCheckSlinkySlurmIMEXChannelRequiresFixedResources(t *testing.T) { + tests := []struct { + name string + includeComputeDomain bool + includeResourceClaimTemplate bool + wantErr string + }{ + { + name: "missing ComputeDomain", + includeResourceClaimTemplate: true, + wantErr: "ComputeDomain slurm/slinky-slurm-imex", + }, + { + name: "missing ResourceClaimTemplate", + 
includeComputeDomain: true, + wantErr: "ResourceClaimTemplate slurm/slinky-slurm-imex-channels", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := slinkyIMEXTestContext(t, tt.includeComputeDomain, tt.includeResourceClaimTemplate) + execCalled := false + restore := replaceSlinkyExecForTest(func( + context.Context, + *validators.Context, + string, + string, + []string, + podExecOptions, + ) (podExecResult, error) { + + execCalled = true + return podExecResult{ + Stdout: "IMEX_CHANNEL=/dev/nvidia-caps-imex-channels/channel2\n" + + "IMEX_CHANNEL=/dev/nvidia-caps-imex-channels/channel3\n", + }, nil + }) + defer restore() + + err := CheckSlinkySlurmIMEXChannel(ctx) + if err == nil || !strings.Contains(err.Error(), tt.wantErr) { + t.Fatalf("error = %v, want containing %q", err, tt.wantErr) + } + if execCalled { + t.Fatal("Slurm exec was called before fixed IMEX resources were verified") + } + }) + } +} + +func TestCheckSlinkySlurmIMEXChannelSkipsKWOKBeforeRequiringFixedResources(t *testing.T) { + ctx := slurmReadyTestContext(t, true) + restore := replaceSlinkyExecForTest(func( + context.Context, + *validators.Context, + string, + string, + []string, + podExecOptions, + ) (podExecResult, error) { + + t.Fatal("exec should not run when all NodeSet pods are on KWOK nodes") + return podExecResult{}, nil + }) + defer restore() + + err := CheckSlinkySlurmIMEXChannel(ctx) + if !isSkipLike(err, "KWOK") { + t.Fatalf("error = %v, want KWOK skip", err) + } +} + +func TestCheckSlinkySlurmIMEXChannelRequiresContext(t *testing.T) { + tests := []struct { + name string + ctx *validators.Context + want string + }{ + { + name: "missing client", + ctx: &validators.Context{ + Ctx: context.Background(), + RESTConfig: &rest.Config{Host: "https://example.test"}, + ValidationInput: &v1.ValidationInput{}, + }, + want: "kubernetes client", + }, + { + name: "missing rest config", + ctx: &validators.Context{ + Ctx: context.Background(), + Clientset: 
k8sfake.NewSimpleClientset(), + ValidationInput: &v1.ValidationInput{}, + }, + want: "RESTConfig", + }, + { + name: "missing validation", + ctx: &validators.Context{ + Ctx: context.Background(), + Clientset: k8sfake.NewSimpleClientset(), + RESTConfig: &rest.Config{Host: "https://example.test"}, + }, + want: "validation", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := CheckSlinkySlurmIMEXChannel(tt.ctx) + if err == nil || !strings.Contains(err.Error(), tt.want) { + t.Fatalf("error = %v, want containing %q", err, tt.want) + } + }) + } +} + +func TestParseSlinkySlurmIMEXChannelsRejectsMalformedPath(t *testing.T) { + for _, channel := range []string{ + "/dev/nvidia-caps-imex-channels/channel", + "/dev/wrong", + "/dev/nvidia-caps-imex-channels/channel2/extra", + } { + t.Run(channel, func(t *testing.T) { + _, err := parseSlinkySlurmIMEXChannels("IMEX_CHANNEL=" + channel) + if err == nil || !strings.Contains(err.Error(), "invalid IMEX channel path") { + t.Fatalf("error = %v, want invalid IMEX channel path", err) + } + }) + } +} + +func TestSlinkySlurmIMEXCommandIsBoundedAndResourceScoped(t *testing.T) { + for _, want := range []string{ + "--immediate=30", + "--time=1:00", + "--nodes=1", + "--ntasks=1", + "--cpus-per-task=1", + "--mem=128M", + "--gres=gpu:1", + "IMEX_CHANNEL_ERROR=find failed", + "channel_count=$(printf \"%s\\n\" \"$channel\" | wc -l)", + "IMEX_CHANNEL_COUNT=%s", + "IMEX_CHANNEL_CANDIDATES:", + "IMEX_CHANNEL_ERROR=expected exactly one channel", + "test \"$channel_count\" -ne 1", + "sleep 40", + "test \"$first_rc\" -eq 0 && test \"$second_rc\" -eq 0", + } { + if !strings.Contains(slinkySlurmIMEXChannelShell, want) { + t.Fatalf("IMEX command is missing %q: %s", want, slinkySlurmIMEXChannelShell) + } + } + if strings.Contains(slinkySlurmIMEXChannelShell, "printf '%s\\n'") { + t.Fatal("single-quoted printf format breaks the surrounding single-quoted srun script") + } +} + +func slinkyIMEXTestContext( + t *testing.T, + 
includeComputeDomain bool, + includeResourceClaimTemplate bool, +) *validators.Context { + + t.Helper() + + ctx := slurmReadyTestContext(t, false) + objects := []runtime.Object{defaultLoginSet(), defaultNodeSet()} + if includeComputeDomain { + computeDomain := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "resource.nvidia.com/v1beta1", + "kind": "ComputeDomain", + "metadata": map[string]any{ + "name": "slinky-slurm-imex", + "namespace": slinkySlurmNamespace, + }, + }, + } + computeDomain.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "resource.nvidia.com", Version: "v1beta1", Kind: "ComputeDomain", + }) + objects = append(objects, computeDomain) + } + ctx.DynamicClient = newSlinkyDynamicClient(t, objects...) + + if includeResourceClaimTemplate { + _, err := ctx.Clientset.ResourceV1().ResourceClaimTemplates(slinkySlurmNamespace).Create( + ctx.Ctx, + &resourcev1.ResourceClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: "slinky-slurm-imex-channels", + Namespace: slinkySlurmNamespace, + }, + }, + metav1.CreateOptions{}, + ) + if err != nil { + t.Fatalf("create ResourceClaimTemplate fixture: %v", err) + } + } + return ctx +}