From f42f04b2d5a0d7dcd4c1c964d8a5112edc5b79eb Mon Sep 17 00:00:00 2001
From: George Hong <georgehong@meta.com>
Date: Fri, 12 Jun 2026 14:09:07 -0700
Subject: [PATCH] [TEST-ONLY] Validate NUMA-aware scheduling on g4dn.metal in arc-staging

[ghstack-poisoned]
---
 osdc/clusters.yaml                            |  6 +--
 .../defs/l-bx86iavx512-11-43-t4-1.yaml        | 15 ++++++++
 .../defs/l-bx86iavx512-47-172-t4-4.yaml       | 16 ++++++++
 osdc/modules/nfd/helm/values.yaml             |  9 ++---
 .../nfd/kubernetes/nfd-taint-remover.yaml     |  5 ++-
 .../nfd/scripts/cleanup-arc-staging.sh        | 37 +++++++++++++++++++
 .../nodepools/defs/g4dn-metal-numa.yaml       | 29 +++++++++++++++
 .../scripts/python/generate_nodepools.py      | 16 ++++++--
 8 files changed, 119 insertions(+), 14 deletions(-)
 create mode 100644 osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml
 create mode 100644 osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml
 create mode 100755 osdc/modules/nfd/scripts/cleanup-arc-staging.sh
 create mode 100644 osdc/modules/nodepools/defs/g4dn-metal-numa.yaml

diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml
index cf4f0d70..f01d165a 100644
--- a/osdc/clusters.yaml
+++ b/osdc/clusters.yaml
@@ -172,6 +172,8 @@ clusters:
       - karpenter
       - arc
       - nodepools
+      - nfd                   # [TEST-ONLY] NUMA topology data for g4dn.metal nodes
+      - numa-scheduler        # [TEST-ONLY] NUMA-aware secondary scheduler
       - arc-runners
       - keda
       - buildkit
@@ -216,8 +218,6 @@ clusters:
       - arc
       - nodepools
       - nodepools-h100        # H100 only — B200 has no capacity reservation in us-west-1
-      - nfd                   # NUMA topology data for p5 nodes
-      - numa-scheduler        # NUMA-aware secondary scheduler
       - arc-runners-h100
       - pypi-cache
       - cache-enforcer
@@ -304,8 +304,6 @@ clusters:
       - nodepools
       - nodepools-b200
       - nodepools-h100
-      - nfd                   # NUMA topology data for p5 nodes
-      - numa-scheduler        # NUMA-aware secondary scheduler
       - arc-runners
       - arc-runners-b200
       - arc-runners-h100
diff --git a/osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml b/osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml
new file mode 100644
index 00000000..eef85b98
--- /dev/null
+++ b/osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml
@@ -0,0 +1,15 @@
+# ARC runner definition: l-bx86iavx512-11-43-t4-1 (1x NVIDIA T4)
+# Instance type: g4dn.metal — 1 of 8 T4 GPUs, 96c/384Gi node.
+# NUMA-test 1-GPU runner: a single GPU always fits one NUMA zone, so no scheduler_name
+# (mirrors the A100 1-GPU runner). Shares the g4dn-metal-numa pool; small 1-GPU pods can
+# fragment zones, which is exactly what the numa-scheduler must pack the 4-GPU runner around.
+runner:
+  name: l-bx86iavx512-11-43-t4-1
+  instance_type: g4dn.metal
+  node_fleet: g4dn-metal-numa
+  disk_size: 150
+  vcpu: 11
+  memory: 43Gi
+  gpu: 1
+  max_runners:
+    default: 2  # TESTING
diff --git a/osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml b/osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml
new file mode 100644
index 00000000..e5e6469f
--- /dev/null
+++ b/osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml
@@ -0,0 +1,16 @@
+# ARC runner definition: l-bx86iavx512-47-172-t4-4 (4x NVIDIA T4)
+# Instance type: g4dn.metal — 4 of 8 T4 GPUs (one NUMA zone), 96c/384Gi node.
+# NUMA-test runner: two of these pack onto one g4dn.metal node, one per NUMA zone,
+# placed by the numa-scheduler — serving the role the A100 4-GPU runner did, on
+# on-demand T4 capacity that is actually available in us-west-1.
+runner:
+  name: l-bx86iavx512-47-172-t4-4
+  instance_type: g4dn.metal
+  node_fleet: g4dn-metal-numa
+  disk_size: 150
+  vcpu: 47
+  memory: 172Gi
+  gpu: 4
+  max_runners:
+    default: 2  # TESTING: 2x 4-GPU packs one g4dn.metal node (2 NUMA zones x 4 GPU)
+  scheduler_name: numa-scheduler  # TESTING: NUMA-aware scheduling
diff --git a/osdc/modules/nfd/helm/values.yaml b/osdc/modules/nfd/helm/values.yaml
index ce15a291..3c7f53bc 100644
--- a/osdc/modules/nfd/helm/values.yaml
+++ b/osdc/modules/nfd/helm/values.yaml
@@ -23,12 +23,11 @@ topologyUpdater:
   # evaluated by the NUMA-aware scheduler.
   updateInterval: 15s
 
-  # Scoped to H100 nodes (p5 fleet) where the packed pool uses
-  # single-numa-node topology policy. Only these multi-NUMA nodes
-  # need NRT data for the numa-scheduler to prevent
-  # TopologyAffinityError on 4-GPU jobs.
+  # TESTING: g4dn-metal-numa (T4, on-demand in us-west-1) for arc-staging validation.
+  # p4d/A100 isn't offered in us-west-1; g4dn.metal is the cheap 2-NUMA stand-in.
+  # Production: change to node-fleet: p5.
   nodeSelector:
-    node-fleet: p5
+    node-fleet: g4dn-metal-numa
 
   # Tolerate every taint so the topology-updater schedules on p5
   # nodes regardless of their taint set (node-fleet, instance-type,
diff --git a/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml b/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml
index 371e89e0..47e1e263 100644
--- a/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml
+++ b/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml
@@ -67,9 +67,10 @@ spec:
     spec:
       priorityClassName: system-node-critical
 
-      # Match the same nodes as the NFD topology-updater.
+      # TESTING: g4dn-metal-numa (T4, on-demand in us-west-1) for arc-staging validation.
+      # Production: change to node-fleet: p5. Must match the NFD topology-updater nodeSelector.
       nodeSelector:
-        node-fleet: p5
+        node-fleet: g4dn-metal-numa
 
       # Tolerate every taint — same rationale as node-performance-tuning:
       # must coexist with all node-init.osdc.io/* startup taints without
diff --git a/osdc/modules/nfd/scripts/cleanup-arc-staging.sh b/osdc/modules/nfd/scripts/cleanup-arc-staging.sh
new file mode 100755
index 00000000..9bc445db
--- /dev/null
+++ b/osdc/modules/nfd/scripts/cleanup-arc-staging.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+set -euo pipefail
+#
+# [TEST-ONLY] Clean up NUMA test resources from arc-staging (g4dn.metal path).
+# Removes the live nfd + numa-scheduler resources the test deploy created. The
+# g4dn-metal-numa NodePool and test runners are removed by the redeploy step below.
+#
+# Usage: bash modules/nfd/scripts/cleanup-arc-staging.sh
+#
+# IMPORTANT ordering: this script only deletes the live cluster resources. To
+# restore the *config*, drop the test commit FIRST, THEN redeploy from the clean
+# base — redeploying while this commit is still checked out would just re-apply
+# the test config. (This is the bug the A100 cleanup script had.)
+
+CTX="pytorch-arc-staging"
+
+echo "=== Cleaning up NUMA (g4dn.metal) test resources from arc-staging ==="
+
+# 1. Delete namespaced resources (Helm releases + all pods/services/etc.)
+echo "→ Deleting nfd namespace..."
+kubectl --context "$CTX" delete namespace nfd --ignore-not-found
+echo "→ Deleting numa-scheduler namespace..."
+kubectl --context "$CTX" delete namespace numa-scheduler --ignore-not-found
+
+# 2. Delete cluster-scoped resources (not removed by namespace deletion)
+echo "→ Removing NRT CRD..."
+kubectl --context "$CTX" delete crd noderesourcetopologies.topology.node.k8s.io --ignore-not-found
+echo "→ Removing nfd-taint-remover ClusterRole/ClusterRoleBinding..."
+kubectl --context "$CTX" delete clusterrole nfd-taint-remover --ignore-not-found
+kubectl --context "$CTX" delete clusterrolebinding nfd-taint-remover --ignore-not-found
+
+echo ""
+echo "=== Cluster resources removed. Now restore the config (in this order): ==="
+echo "  1. Drop the test commit:   git checkout numa-aware-scheduling   # or: git reset --hard HEAD~1"
+echo "  2. Redeploy from clean base (removes the g4dn-metal-numa NodePool + restores runners):"
+echo "       just deploy-module arc-staging nodepools"
+echo "       just deploy-module arc-staging arc-runners"
diff --git a/osdc/modules/nodepools/defs/g4dn-metal-numa.yaml b/osdc/modules/nodepools/defs/g4dn-metal-numa.yaml
new file mode 100644
index 00000000..995c5f45
--- /dev/null
+++ b/osdc/modules/nodepools/defs/g4dn-metal-numa.yaml
@@ -0,0 +1,29 @@
+# Karpenter NodePool fleet: g4dn-metal-numa — dedicated pool for NUMA-aware T4 runners
+# (1-GPU + 4-GPU). Cheap, on-demand stand-in for A100/p4d, which AWS does not offer in
+# us-west-1 (arc-staging's region).
+#
+# g4dn.metal is a 2-socket box: 8x T4 split 4-per-NUMA-zone — topologically identical to
+# p4d/p5 (2 NUMA x 4 GPU). topology_manager_policy=single-numa-node forces a 4-GPU pod to
+# fit within one NUMA zone, reproducing the TopologyAffinityError the numa-scheduler
+# prevents and exercising the full NFD -> NRT -> numa-scheduler pipeline.
+#
+# Kept SEPARATE from the best-effort `g4dn-metal` pool (which serves the live 8-GPU T4
+# runner): adding single-numa-node there would break that full-node runner (an 8-GPU pod
+# spans both zones). Separation is enforced by the node-fleet=g4dn-metal-numa taint.
+fleet:
+  name: g4dn-metal-numa
+  arch: amd64
+  gpu: true
+  instances:
+    - type: g4dn.metal
+      weight: 100
+      node_disk_size: 600
+      has_nvme: true
+      baremetal: true
+      topology_manager_policy: single-numa-node
+      topology_manager_scope: pod
+      # TESTING: cap the fleet to a SINGLE g4dn.metal node (8 GPUs = 1 node), so
+      # the 1-GPU + 4-GPU runners pack onto one 2-NUMA box for deterministic
+      # NUMA validation. Karpenter limits by resource, not node count.
+      limits:
+        nvidia.com/gpu: 8
diff --git a/osdc/modules/nodepools/scripts/python/generate_nodepools.py b/osdc/modules/nodepools/scripts/python/generate_nodepools.py
index 2e9cf0c1..0d018a14 100755
--- a/osdc/modules/nodepools/scripts/python/generate_nodepools.py
+++ b/osdc/modules/nodepools/scripts/python/generate_nodepools.py
@@ -98,10 +98,11 @@
         "key": "node-init.osdc.io/nfd-topology",
         "value": "true",
         "effect": "NoSchedule",
-        # NFD topology-updater only targets p5 nodes (nodeSelector: node-fleet: p5).
+        # TESTING: g4dn-metal-numa (T4) for arc-staging validation (p4d/A100 not in us-west-1).
         # Only emit the taint on nodepools where NFD actually runs — otherwise the
         # node would be tainted with nothing to remove it.
-        "applies_when": lambda d: d.get("fleet_name") == "p5",
+        # Production: revert to "p5". p5 is excluded here because NFD now selects only g4dn-metal-numa.
+        "applies_when": lambda d: d.get("fleet_name") == "g4dn-metal-numa",
     },
 ]
 
@@ -329,6 +330,14 @@ def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None):
 
     # ----- Fleet-specific YAML blocks -----
     weight_block = f"  weight: {weight}\n" if weight is not None else ""
+    # ----- Optional NodePool resource cap -----
+    # Karpenter limits by RESOURCE, not node count: set limits to one node's
+    # capacity (e.g. nvidia.com/gpu: 8 = a single g4dn.metal) to bound a fleet
+    # to one node. Absent = uncapped (bounded only by AWS availability).
+    limits = nodepool_def.get("limits")
+    limits_block = ""
+    if limits:
+        limits_block = "  limits:\n" + "".join(f'    {k}: "{v}"\n' for k, v in limits.items())
     fleet_label = f'        node-fleet: "{fleet_name}"\n' if fleet_name else ""
     fleet_taint = (
         (f'        - key: node-fleet\n          value: "{fleet_name}"\n          effect: NoSchedule\n')
@@ -369,6 +378,7 @@ def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None):
 {compactor_label}\
 spec:
 {weight_block}\
+{limits_block}\
   disruption:
     consolidationPolicy: {consolidation_policy}
     consolidateAfter: {consolidation_after}
@@ -562,7 +572,7 @@ def _build_fleet_nodepool_def(fleet_data, inst, name_suffix="", extra_labels=Non
 
     # Only set optional keys when explicitly provided — leaving them absent
     # lets generate_nodepool_yaml() fall through to its own defaults.
-    for key in ("node_compactor", "topology_manager_policy", "topology_manager_scope", "user_data_script"):
+    for key in ("node_compactor", "topology_manager_policy", "topology_manager_scope", "user_data_script", "limits"):
         val = inst.get(key)
         if val is not None:
             nodepool_def[key] = val