pytorch · georgehong · Jun 12, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
@@ -176,6 +176,8 @@ clusters:
       - karpenter
       - arc
       - nodepools
+      - nfd                   # [TEST-ONLY] NUMA topology data for g4dn.metal nodes
+      - numa-scheduler        # [TEST-ONLY] NUMA-aware secondary scheduler
       - arc-runners
       - keda
       - buildkit
@@ -220,8 +222,6 @@ clusters:
       - arc
       - nodepools
       - nodepools-h100        # H100 only — B200 has no capacity reservation in us-west-1
-      - nfd                   # NUMA topology data for p5 nodes
-      - numa-scheduler        # NUMA-aware secondary scheduler
       - arc-runners-h100
       - pypi-cache
       - cache-enforcer
@@ -308,8 +308,6 @@ clusters:
       - nodepools
       - nodepools-b200
       - nodepools-h100
-      - nfd                   # NUMA topology data for p5 nodes
-      - numa-scheduler        # NUMA-aware secondary scheduler
       - arc-runners
       - arc-runners-b200
       - arc-runners-h100

@@ -0,0 +1,22 @@
+# ARC runner definition: l-bx86iavx512-11-43-t4-1 (1x NVIDIA T4)
+# Instance type: g4dn.metal — 1 of 8 T4 GPUs, 96c/384Gi node.
+# NUMA-test 1-GPU runner: a single GPU always fits one NUMA zone, so it needs no scheduler_name
+# (mirrors the A100 1-GPU runner) except for the Phase 2 test below. Shares the g4dn-metal-numa
+# pool; small 1-GPU pods can fragment zones, which the numa-scheduler must pack the 4-GPU around.
+runner:
+  name: l-bx86iavx512-11-43-t4-1
+  instance_type: g4dn.metal
+  node_fleet: g4dn-metal-numa
+  disk_size: 150
+  # TESTING (topology A/B): 60 vCPU deliberately exceeds ONE NUMA zone (~48 logical CPUs
+  # on g4dn.metal) while still fitting the node (96). Under single-numa-node this can't
+  # align to one zone, so:
+  #   Phase 1 (no scheduler_name → default scheduler): binds then TopologyAffinityError
+  #   Phase 2 (add scheduler_name: numa-scheduler below): never binds → stays Pending
+  # Name/label kept as -11-43- so the canary workflow's runs-on still matches.
+  vcpu: 60
+  memory: 43Gi
+  gpu: 1
+  max_runners:
+    default: 1  # TESTING: one oversized runner for a clean observation
+  scheduler_name: numa-scheduler  # PHASE 2: scheduler refuses to bind (Pending) — no TopologyAffinityError
@@ -0,0 +1,16 @@
+# ARC runner definition: l-bx86iavx512-47-172-t4-4 (4x NVIDIA T4)
+# Instance type: g4dn.metal — 4 of 8 T4 GPUs (one NUMA zone), 96c/384Gi node.
+# NUMA-test runner: two of these pack onto one g4dn.metal node, one per NUMA zone,
+# placed by the numa-scheduler — serving the role the A100 4-GPU runner did, on
+# on-demand T4 capacity that is actually available in us-west-1.
+runner:
+  name: l-bx86iavx512-47-172-t4-4
+  instance_type: g4dn.metal
+  node_fleet: g4dn-metal-numa
+  disk_size: 150
+  vcpu: 47
+  memory: 172Gi
+  gpu: 4
+  max_runners:
+    default: 2  # TESTING: 2x 4-GPU packs one g4dn.metal node (2 NUMA zones x 4 GPU)
+  scheduler_name: numa-scheduler  # TESTING: NUMA-aware scheduling
@@ -23,12 +23,11 @@ topologyUpdater:
   # evaluated by the NUMA-aware scheduler.
   updateInterval: 15s
 
-  # Scoped to H100 nodes (p5 fleet) where the packed pool uses
-  # single-numa-node topology policy. Only these multi-NUMA nodes
-  # need NRT data for the numa-scheduler to prevent
-  # TopologyAffinityError on 4-GPU jobs.
+  # TESTING: g4dn-metal-numa (T4, on-demand in us-west-1) for arc-staging validation.
+  # p4d/A100 isn't offered in us-west-1; g4dn.metal is the cheap 2-NUMA stand-in.
+  # Production: change to node-fleet: p5.
   nodeSelector:
-    node-fleet: p5
+    node-fleet: g4dn-metal-numa
 
   # Tolerate every taint so the topology-updater schedules on p5
   # nodes regardless of their taint set (node-fleet, instance-type,

@@ -67,9 +67,10 @@ spec:
     spec:
       priorityClassName: system-node-critical
 
-      # Match the same nodes as the NFD topology-updater.
+      # TESTING: g4dn-metal-numa (T4, on-demand in us-west-1) for arc-staging validation.
+      # Production: change to node-fleet: p5. Must match the NFD topology-updater selector.
       nodeSelector:
-        node-fleet: p5
+        node-fleet: g4dn-metal-numa
 
       # Tolerate every taint — same rationale as node-performance-tuning:
       # must coexist with all node-init.osdc.io/* startup taints without

@@ -0,0 +1,29 @@
+# Karpenter NodePool fleet: g4dn-metal-numa — dedicated pool for NUMA-aware T4 runners
+# (1-GPU + 4-GPU). Cheap, on-demand stand-in for A100/p4d, which AWS does not offer in
+# us-west-1 (arc-staging's region).
+#
+# g4dn.metal is a 2-socket box: 8x T4 split 4-per-NUMA-zone — topologically identical to
+# p4d/p5 (2 NUMA x 4 GPU). topology_manager_policy=single-numa-node forces a 4-GPU pod to
+# fit within one NUMA zone, reproducing the TopologyAffinityError the numa-scheduler
+# prevents and exercising the full NFD -> NRT -> numa-scheduler pipeline.
+#
+# Kept SEPARATE from the best-effort `g4dn-metal` pool (which serves the live 8-GPU T4
+# runner): adding single-numa-node there would break that full-node runner (an 8-GPU pod
+# spans both zones). Separation is enforced by the node-fleet=g4dn-metal-numa taint.
+fleet:
+  name: g4dn-metal-numa
+  arch: amd64
+  gpu: true
+  instances:
+    - type: g4dn.metal
+      weight: 100
+      node_disk_size: 600
+      has_nvme: true
+      baremetal: true
+      topology_manager_policy: single-numa-node
+      topology_manager_scope: pod
+      # TESTING: cap the fleet to a SINGLE g4dn.metal node (8 GPUs = 1 node), so
+      # the 1-GPU + 4-GPU runners pack onto one 2-NUMA box for deterministic
+      # NUMA validation. Karpenter limits by resource, not node count.
+      limits:
+        nvidia.com/gpu: 8
@@ -98,10 +98,11 @@
         "key": "node-init.osdc.io/nfd-topology",
         "value": "true",
         "effect": "NoSchedule",
-        # NFD topology-updater only targets p5 nodes (nodeSelector: node-fleet: p5).
+        # TESTING: g4dn-metal-numa (T4) for arc-staging validation (p4d/A100 not in us-west-1).
         # Only emit the taint on nodepools where NFD actually runs — otherwise the
         # node would be tainted with nothing to remove it.
-        "applies_when": lambda d: d.get("fleet_name") == "p5",
+        # Production: restrict to fleet_name == "p5" only.
+        "applies_when": lambda d: d.get("fleet_name") in ("p5", "g4dn-metal-numa"),
     },
 ]
 
@@ -329,6 +330,14 @@ def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None):
 
     # ----- Fleet-specific YAML blocks -----
     weight_block = f"  weight: {weight}\n" if weight is not None else ""
+    # ----- Optional NodePool resource cap -----
+    # Karpenter limits by RESOURCE, not node count: set limits to one node's
+    # capacity (e.g. nvidia.com/gpu: 8 = a single g4dn.metal) to bound a fleet
+    # to one node. Absent = uncapped (bounded only by AWS availability).
+    limits = nodepool_def.get("limits")
+    limits_block = ""
+    if limits:
+        limits_block = "  limits:\n" + "".join(f'    {k}: "{v}"\n' for k, v in limits.items())
     fleet_label = f'        node-fleet: "{fleet_name}"\n' if fleet_name else ""
     fleet_taint = (
         (f'        - key: node-fleet\n          value: "{fleet_name}"\n          effect: NoSchedule\n')
@@ -369,6 +378,7 @@ def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None):
 {compactor_label}\
 spec:
 {weight_block}\
+{limits_block}\
   disruption:
     consolidationPolicy: {consolidation_policy}
     consolidateAfter: {consolidation_after}
@@ -562,7 +572,7 @@ def _build_fleet_nodepool_def(fleet_data, inst, name_suffix="", extra_labels=Non
 
     # Only set optional keys when explicitly provided — leaving them absent
     # lets generate_nodepool_yaml() fall through to its own defaults.
-    for key in ("node_compactor", "topology_manager_policy", "topology_manager_scope", "user_data_script"):
+    for key in ("node_compactor", "topology_manager_policy", "topology_manager_scope", "user_data_script", "limits"):
         val = inst.get(key)
         if val is not None:
             nodepool_def[key] = val