From f42f04b2d5a0d7dcd4c1c964d8a5112edc5b79eb Mon Sep 17 00:00:00 2001 From: George Hong Date: Fri, 12 Jun 2026 14:09:07 -0700 Subject: [PATCH] Update [ghstack-poisoned] --- osdc/clusters.yaml | 6 +-- .../defs/l-bx86iavx512-11-43-t4-1.yaml | 15 ++++++++ .../defs/l-bx86iavx512-47-172-t4-4.yaml | 16 ++++++++ osdc/modules/nfd/helm/values.yaml | 9 ++--- .../nfd/kubernetes/nfd-taint-remover.yaml | 5 ++- .../nfd/scripts/cleanup-arc-staging.sh | 37 +++++++++++++++++++ .../nodepools/defs/g4dn-metal-numa.yaml | 29 +++++++++++++++ .../scripts/python/generate_nodepools.py | 16 ++++++-- 8 files changed, 119 insertions(+), 14 deletions(-) create mode 100644 osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml create mode 100644 osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml create mode 100755 osdc/modules/nfd/scripts/cleanup-arc-staging.sh create mode 100644 osdc/modules/nodepools/defs/g4dn-metal-numa.yaml diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml index cf4f0d70..f01d165a 100644 --- a/osdc/clusters.yaml +++ b/osdc/clusters.yaml @@ -172,6 +172,8 @@ clusters: - karpenter - arc - nodepools + - nfd # [TEST-ONLY] NUMA topology data for g4dn.metal nodes + - numa-scheduler # [TEST-ONLY] NUMA-aware secondary scheduler - arc-runners - keda - buildkit @@ -216,8 +218,6 @@ clusters: - arc - nodepools - nodepools-h100 # H100 only — B200 has no capacity reservation in us-west-1 - - nfd # NUMA topology data for p5 nodes - - numa-scheduler # NUMA-aware secondary scheduler - arc-runners-h100 - pypi-cache - cache-enforcer @@ -304,8 +304,6 @@ clusters: - nodepools - nodepools-b200 - nodepools-h100 - - nfd # NUMA topology data for p5 nodes - - numa-scheduler # NUMA-aware secondary scheduler - arc-runners - arc-runners-b200 - arc-runners-h100 diff --git a/osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml b/osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml new file mode 100644 index 00000000..eef85b98 --- /dev/null +++ 
b/osdc/modules/arc-runners/defs/l-bx86iavx512-11-43-t4-1.yaml @@ -0,0 +1,15 @@ +# ARC runner definition: l-bx86iavx512-11-43-t4-1 (1x NVIDIA T4) +# Instance type: g4dn.metal — 1 of 8 T4 GPUs, 96c/384Gi node. +# NUMA-test 1-GPU runner: a single GPU always fits one NUMA zone, so no scheduler_name +# (mirrors the A100 1-GPU runner). Shares the g4dn-metal-numa pool; small 1-GPU pods can +# fragment zones, which is exactly what the numa-scheduler must pack around for the 4-GPU runner. +runner: + name: l-bx86iavx512-11-43-t4-1 + instance_type: g4dn.metal + node_fleet: g4dn-metal-numa + disk_size: 150 + vcpu: 11 + memory: 43Gi + gpu: 1 + max_runners: + default: 2 # TESTING diff --git a/osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml b/osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml new file mode 100644 index 00000000..e5e6469f --- /dev/null +++ b/osdc/modules/arc-runners/defs/l-bx86iavx512-47-172-t4-4.yaml @@ -0,0 +1,16 @@ +# ARC runner definition: l-bx86iavx512-47-172-t4-4 (4x NVIDIA T4) +# Instance type: g4dn.metal — 4 of 8 T4 GPUs (one NUMA zone), 96c/384Gi node. +# NUMA-test runner: two of these pack onto one g4dn.metal node, one per NUMA zone, +# placed by the numa-scheduler — serving the role the A100 4-GPU runner did, on +# on-demand T4 capacity that is actually available in us-west-1. +runner: + name: l-bx86iavx512-47-172-t4-4 + instance_type: g4dn.metal + node_fleet: g4dn-metal-numa + disk_size: 150 + vcpu: 47 + memory: 172Gi + gpu: 4 + max_runners: + default: 2 # TESTING: 2x 4-GPU packs one g4dn.metal node (2 NUMA zones x 4 GPU) + scheduler_name: numa-scheduler # TESTING: NUMA-aware scheduling diff --git a/osdc/modules/nfd/helm/values.yaml b/osdc/modules/nfd/helm/values.yaml index ce15a291..3c7f53bc 100644 --- a/osdc/modules/nfd/helm/values.yaml +++ b/osdc/modules/nfd/helm/values.yaml @@ -23,12 +23,11 @@ topologyUpdater: # evaluated by the NUMA-aware scheduler. 
updateInterval: 15s - # Scoped to H100 nodes (p5 fleet) where the packed pool uses - # single-numa-node topology policy. Only these multi-NUMA nodes - # need NRT data for the numa-scheduler to prevent - # TopologyAffinityError on 4-GPU jobs. + # TESTING: g4dn-metal-numa (T4, on-demand in us-west-1) for arc-staging validation. + # p4d/A100 isn't offered in us-west-1; g4dn.metal is the cheap 2-NUMA stand-in. + # Production: change to node-fleet: p5. nodeSelector: - node-fleet: p5 + node-fleet: g4dn-metal-numa # Tolerate every taint so the topology-updater schedules on p5 # nodes regardless of their taint set (node-fleet, instance-type, diff --git a/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml b/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml index 371e89e0..47e1e263 100644 --- a/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml +++ b/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml @@ -67,9 +67,10 @@ spec: spec: priorityClassName: system-node-critical - # Match the same nodes as the NFD topology-updater. + # TESTING: g4dn-metal-numa (T4, on-demand in us-west-1) for arc-staging validation. + # Production: change to node-fleet: p5. Match the NFD topology-updater selector. nodeSelector: - node-fleet: p5 + node-fleet: g4dn-metal-numa # Tolerate every taint — same rationale as node-performance-tuning: # must coexist with all node-init.osdc.io/* startup taints without diff --git a/osdc/modules/nfd/scripts/cleanup-arc-staging.sh b/osdc/modules/nfd/scripts/cleanup-arc-staging.sh new file mode 100755 index 00000000..9bc445db --- /dev/null +++ b/osdc/modules/nfd/scripts/cleanup-arc-staging.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -euo pipefail +# +# [TEST-ONLY] Clean up NUMA test resources from arc-staging (g4dn.metal path). +# Removes everything the nfd + numa-scheduler + g4dn-metal-numa test deploy created +# and restores the cluster to its pre-test state. 
+# +# Usage: bash modules/nfd/scripts/cleanup-arc-staging.sh +# +# IMPORTANT ordering: this script only deletes the live cluster resources. To +# restore the *config*, drop the test commit FIRST, THEN redeploy from the clean +# base — redeploying while this commit is still checked out would just re-apply +# the test config. (This is the bug the A100 cleanup script had.) + +CTX="pytorch-arc-staging" + +echo "=== Cleaning up NUMA (g4dn.metal) test resources from arc-staging ===" + +# 1. Delete namespaced resources (Helm releases + all pods/services/etc.) +echo "→ Deleting nfd namespace..." +kubectl --context "$CTX" delete namespace nfd --ignore-not-found +echo "→ Deleting numa-scheduler namespace..." +kubectl --context "$CTX" delete namespace numa-scheduler --ignore-not-found + +# 2. Delete cluster-scoped resources (not removed by namespace deletion) +echo "→ Removing NRT CRD..." +kubectl --context "$CTX" delete crd noderesourcetopologies.topology.node.k8s.io --ignore-not-found +echo "→ Removing nfd-taint-remover ClusterRole/ClusterRoleBinding..." +kubectl --context "$CTX" delete clusterrole nfd-taint-remover --ignore-not-found +kubectl --context "$CTX" delete clusterrolebinding nfd-taint-remover --ignore-not-found + +echo "" +echo "=== Cluster resources removed. Now restore the config (in this order): ===" +echo " 1. Drop the test commit: git checkout numa-aware-scheduling # or: git reset --hard HEAD~1" +echo " 2. Redeploy from clean base (removes the g4dn-metal-numa NodePool + restores runners):" +echo " just deploy-module arc-staging nodepools" +echo " just deploy-module arc-staging arc-runners" diff --git a/osdc/modules/nodepools/defs/g4dn-metal-numa.yaml b/osdc/modules/nodepools/defs/g4dn-metal-numa.yaml new file mode 100644 index 00000000..995c5f45 --- /dev/null +++ b/osdc/modules/nodepools/defs/g4dn-metal-numa.yaml @@ -0,0 +1,29 @@ +# Karpenter NodePool fleet: g4dn-metal-numa — dedicated pool for NUMA-aware T4 runners +# (1-GPU + 4-GPU). 
Cheap, on-demand stand-in for A100/p4d, which AWS does not offer in +# us-west-1 (arc-staging's region). +# +# g4dn.metal is a 2-socket box: 8x T4 split 4-per-NUMA-zone — topologically identical to +# p4d/p5 (2 NUMA x 4 GPU). topology_manager_policy=single-numa-node forces a 4-GPU pod to +# fit within one NUMA zone, reproducing the TopologyAffinityError the numa-scheduler +# prevents and exercising the full NFD -> NRT -> numa-scheduler pipeline. +# +# Kept SEPARATE from the best-effort `g4dn-metal` pool (which serves the live 8-GPU T4 +# runner): adding single-numa-node there would break that full-node runner (an 8-GPU pod +# spans both zones). Separation is enforced by the node-fleet=g4dn-metal-numa taint. +fleet: + name: g4dn-metal-numa + arch: amd64 + gpu: true + instances: + - type: g4dn.metal + weight: 100 + node_disk_size: 600 + has_nvme: true + baremetal: true + topology_manager_policy: single-numa-node + topology_manager_scope: pod + # TESTING: cap the fleet to a SINGLE g4dn.metal node (8 GPUs = 1 node), so + # the 1-GPU + 4-GPU runners pack onto one 2-NUMA box for deterministic + # NUMA validation. Karpenter limits by resource, not node count. + limits: + nvidia.com/gpu: 8 diff --git a/osdc/modules/nodepools/scripts/python/generate_nodepools.py b/osdc/modules/nodepools/scripts/python/generate_nodepools.py index 2e9cf0c1..0d018a14 100755 --- a/osdc/modules/nodepools/scripts/python/generate_nodepools.py +++ b/osdc/modules/nodepools/scripts/python/generate_nodepools.py @@ -98,10 +98,11 @@ "key": "node-init.osdc.io/nfd-topology", "value": "true", "effect": "NoSchedule", - # NFD topology-updater only targets p5 nodes (nodeSelector: node-fleet: p5). + # TESTING: g4dn-metal-numa (T4) for arc-staging validation (p4d/A100 not in us-west-1). # Only emit the taint on nodepools where NFD actually runs — otherwise the # node would be tainted with nothing to remove it. 
- "applies_when": lambda d: d.get("fleet_name") == "p5", + # Production: restrict to fleet_name == "p5" only. + "applies_when": lambda d: d.get("fleet_name") in ("p5", "g4dn-metal-numa"), }, ] @@ -329,6 +330,14 @@ def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None): # ----- Fleet-specific YAML blocks ----- weight_block = f" weight: {weight}\n" if weight is not None else "" + # ----- Optional NodePool resource cap ----- + # Karpenter limits by RESOURCE, not node count: set limits to one node's + # capacity (e.g. nvidia.com/gpu: 8 = a single g4dn.metal) to bound a fleet + # to one node. Absent = uncapped (bounded only by AWS availability). + limits = nodepool_def.get("limits") + limits_block = "" + if limits: + limits_block = " limits:\n" + "".join(f' {k}: "{v}"\n' for k, v in limits.items()) fleet_label = f' node-fleet: "{fleet_name}"\n' if fleet_name else "" fleet_taint = ( (f' - key: node-fleet\n value: "{fleet_name}"\n effect: NoSchedule\n') @@ -369,6 +378,7 @@ def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None): {compactor_label}\ spec: {weight_block}\ +{limits_block}\ disruption: consolidationPolicy: {consolidation_policy} consolidateAfter: {consolidation_after} @@ -562,7 +572,7 @@ def _build_fleet_nodepool_def(fleet_data, inst, name_suffix="", extra_labels=Non # Only set optional keys when explicitly provided — leaving them absent # lets generate_nodepool_yaml() fall through to its own defaults. - for key in ("node_compactor", "topology_manager_policy", "topology_manager_scope", "user_data_script"): + for key in ("node_compactor", "topology_manager_policy", "topology_manager_scope", "user_data_script", "limits"): val = inst.get(key) if val is not None: nodepool_def[key] = val