diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml index 796d5291..52b77b0c 100644 --- a/osdc/clusters.yaml +++ b/osdc/clusters.yaml @@ -176,6 +176,8 @@ clusters: - karpenter - arc - nodepools + - nfd # [TEST-ONLY] NUMA topology data for g4dn.metal nodes + - numa-scheduler # [TEST-ONLY] NUMA-aware secondary scheduler - arc-runners - keda - buildkit @@ -220,8 +222,6 @@ clusters: - arc - nodepools - nodepools-h100 # H100 only — B200 has no capacity reservation in us-west-1 - - nfd # NUMA topology data for p5 nodes - - numa-scheduler # NUMA-aware secondary scheduler - arc-runners-h100 - pypi-cache - cache-enforcer @@ -308,8 +308,6 @@ clusters: - nodepools - nodepools-b200 - nodepools-h100 - - nfd # NUMA topology data for p5 nodes - - numa-scheduler # NUMA-aware secondary scheduler - arc-runners - arc-runners-b200 - arc-runners-h100 diff --git a/osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml b/osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml new file mode 100644 index 00000000..756c4ae2 --- /dev/null +++ b/osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml @@ -0,0 +1,22 @@ +# ARC runner definition: l-bx86iavx512-11-43-t4-1 (1x NVIDIA T4) +# Instance type: g4dn.metal — 1 of 8 T4 GPUs, 96c/384Gi node. +# NUMA-test 1-GPU runner: a single GPU always fits one NUMA zone, so it normally needs no +# scheduler_name (mirrors the A100 1-GPU runner); one is set below only for the Phase 2 topology A/B test. Shares the g4dn-metal-numa pool; small 1-GPU pods can +# fragment zones, which is exactly what the numa-scheduler must pack around for the 4-GPU. +runner: + name: l-bx86iavx512-11-43-t4-1 + instance_type: g4dn.metal + node_fleet: g4dn-metal-numa + disk_size: 150 + # TESTING (topology A/B): 60 vCPU deliberately exceeds ONE NUMA zone (~48 logical CPUs + # on g4dn.metal) while still fitting the node (96). 
Under single-numa-node this can't + # align to one zone, so: + # Phase 1 (no scheduler_name → default scheduler): binds then TopologyAffinityError + # Phase 2 (add scheduler_name: numa-scheduler below): never binds → stays Pending + # Name/label kept as -11-43- so the canary workflow's runs-on still matches. + vcpu: 60 + memory: 43Gi + gpu: 1 + max_runners: + default: 1 # TESTING: one oversized runner for a clean observation + scheduler_name: numa-scheduler # PHASE 2: scheduler refuses to bind (Pending) — no TopologyAffinityError diff --git a/osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml b/osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml new file mode 100644 index 00000000..e5e6469f --- /dev/null +++ b/osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml @@ -0,0 +1,16 @@ +# ARC runner definition: l-bx86iavx512-47-172-t4-4 (4x NVIDIA T4) +# Instance type: g4dn.metal — 4 of 8 T4 GPUs (one NUMA zone), 96c/384Gi node. +# NUMA-test runner: two of these pack onto one g4dn.metal node, one per NUMA zone, +# placed by the numa-scheduler — serving the role the A100 4-GPU runner did, on +# on-demand T4 capacity that is actually available in us-west-1. +runner: + name: l-bx86iavx512-47-172-t4-4 + instance_type: g4dn.metal + node_fleet: g4dn-metal-numa + disk_size: 150 + vcpu: 47 + memory: 172Gi + gpu: 4 + max_runners: + default: 2 # TESTING: 2x 4-GPU packs one g4dn.metal node (2 NUMA zones x 4 GPU) + scheduler_name: numa-scheduler # TESTING: NUMA-aware scheduling diff --git a/osdc/modules/nfd/helm/values.yaml b/osdc/modules/nfd/helm/values.yaml index ce15a291..3c7f53bc 100644 --- a/osdc/modules/nfd/helm/values.yaml +++ b/osdc/modules/nfd/helm/values.yaml @@ -23,12 +23,11 @@ topologyUpdater: # evaluated by the NUMA-aware scheduler. updateInterval: 15s - # Scoped to H100 nodes (p5 fleet) where the packed pool uses - # single-numa-node topology policy. 
Only these multi-NUMA nodes - # need NRT data for the numa-scheduler to prevent - # TopologyAffinityError on 4-GPU jobs. + # TESTING: g4dn-metal-numa (T4, on-demand in us-west-1) for arc-staging validation. + # p4d/A100 isn't offered in us-west-1; g4dn.metal is the cheap 2-NUMA stand-in. + # Production: change to node-fleet: p5. nodeSelector: - node-fleet: p5 + node-fleet: g4dn-metal-numa # Tolerate every taint so the topology-updater schedules on p5 # nodes regardless of their taint set (node-fleet, instance-type, diff --git a/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml b/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml index 371e89e0..47e1e263 100644 --- a/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml +++ b/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml @@ -67,9 +67,10 @@ spec: spec: priorityClassName: system-node-critical - # Match the same nodes as the NFD topology-updater. + # TESTING: g4dn-metal-numa (T4, on-demand in us-west-1) for arc-staging validation. + # Production: change to node-fleet: p5. Match the NFD topology-updater selector. nodeSelector: - node-fleet: p5 + node-fleet: g4dn-metal-numa # Tolerate every taint — same rationale as node-performance-tuning: # must coexist with all node-init.osdc.io/* startup taints without diff --git a/osdc/modules/nodepools/defs/g4dn-metal-numa.yaml b/osdc/modules/nodepools/defs/g4dn-metal-numa.yaml new file mode 100644 index 00000000..995c5f45 --- /dev/null +++ b/osdc/modules/nodepools/defs/g4dn-metal-numa.yaml @@ -0,0 +1,29 @@ +# Karpenter NodePool fleet: g4dn-metal-numa — dedicated pool for NUMA-aware T4 runners +# (1-GPU + 4-GPU). Cheap, on-demand stand-in for A100/p4d, which AWS does not offer in +# us-west-1 (arc-staging's region). +# +# g4dn.metal is a 2-socket box: 8x T4 split 4-per-NUMA-zone — topologically identical to +# p4d/p5 (2 NUMA x 4 GPU). 
topology_manager_policy=single-numa-node forces a 4-GPU pod to +# fit within one NUMA zone, reproducing the TopologyAffinityError the numa-scheduler +# prevents and exercising the full NFD -> NRT -> numa-scheduler pipeline. +# +# Kept SEPARATE from the best-effort `g4dn-metal` pool (which serves the live 8-GPU T4 +# runner): adding single-numa-node there would break that full-node runner (an 8-GPU pod +# spans both zones). Separation is enforced by the node-fleet=g4dn-metal-numa taint. +fleet: + name: g4dn-metal-numa + arch: amd64 + gpu: true + instances: + - type: g4dn.metal + weight: 100 + node_disk_size: 600 + has_nvme: true + baremetal: true + topology_manager_policy: single-numa-node + topology_manager_scope: pod + # TESTING: cap the fleet to a SINGLE g4dn.metal node (8 GPUs = 1 node), so + # the 1-GPU + 4-GPU runners pack onto one 2-NUMA box for deterministic + # NUMA validation. Karpenter limits by resource, not node count. + limits: + nvidia.com/gpu: 8 diff --git a/osdc/modules/nodepools/scripts/python/generate_nodepools.py b/osdc/modules/nodepools/scripts/python/generate_nodepools.py index 2e9cf0c1..0d018a14 100755 --- a/osdc/modules/nodepools/scripts/python/generate_nodepools.py +++ b/osdc/modules/nodepools/scripts/python/generate_nodepools.py @@ -98,10 +98,11 @@ "key": "node-init.osdc.io/nfd-topology", "value": "true", "effect": "NoSchedule", - # NFD topology-updater only targets p5 nodes (nodeSelector: node-fleet: p5). + # TESTING: g4dn-metal-numa (T4) for arc-staging validation (p4d/A100 not in us-west-1). # Only emit the taint on nodepools where NFD actually runs — otherwise the # node would be tainted with nothing to remove it. - "applies_when": lambda d: d.get("fleet_name") == "p5", + # Production: restrict to fleet_name == "p5" only. 
+ "applies_when": lambda d: d.get("fleet_name") == "g4dn-metal-numa", }, ] @@ -329,6 +330,14 @@ def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None): # ----- Fleet-specific YAML blocks ----- weight_block = f" weight: {weight}\n" if weight is not None else "" + # ----- Optional NodePool resource cap ----- + # Karpenter limits by RESOURCE, not node count: set limits to one node's + # capacity (e.g. nvidia.com/gpu: 8 = a single g4dn.metal) to bound a fleet + # to one node. Absent = uncapped (bounded only by AWS availability). + limits = nodepool_def.get("limits") + limits_block = "" + if limits: + limits_block = " limits:\n" + "".join(f' {k}: "{v}"\n' for k, v in limits.items()) fleet_label = f' node-fleet: "{fleet_name}"\n' if fleet_name else "" fleet_taint = ( (f' - key: node-fleet\n value: "{fleet_name}"\n effect: NoSchedule\n') @@ -369,6 +378,7 @@ def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None): {compactor_label}\ spec: {weight_block}\ +{limits_block}\ disruption: consolidationPolicy: {consolidation_policy} consolidateAfter: {consolidation_after} @@ -562,7 +572,7 @@ def _build_fleet_nodepool_def(fleet_data, inst, name_suffix="", extra_labels=Non # Only set optional keys when explicitly provided — leaving them absent # lets generate_nodepool_yaml() fall through to its own defaults. - for key in ("node_compactor", "topology_manager_policy", "topology_manager_scope", "user_data_script"): + for key in ("node_compactor", "topology_manager_policy", "topology_manager_scope", "user_data_script", "limits"): val = inst.get(key) if val is not None: nodepool_def[key] = val