From b51b6a3717e1c024b9f713fefbfe6d0e30bd4a32 Mon Sep 17 00:00:00 2001 From: George Hong Date: Thu, 11 Jun 2026 17:53:08 -0700 Subject: [PATCH] Update [ghstack-poisoned] --- .../defs/l-bx86iavx512-88-1000-a100-8.yaml | 2 + .../defs/l-x86iavx512-11-125-a100.yaml | 2 + .../defs/l-x86iavx512-22-250-a100-2.yaml | 2 + .../defs/l-x86iavx512-44-500-a100-4.yaml | 3 ++ osdc/modules/nfd/helm/values.yaml | 8 ++- .../nfd/kubernetes/nfd-taint-remover.yaml | 5 +- .../nfd/scripts/cleanup-arc-staging.sh | 51 +++++++++++++++++++ osdc/modules/nodepools/defs/p4d.yaml | 4 +- .../scripts/python/generate_nodepools.py | 7 ++- 9 files changed, 71 insertions(+), 13 deletions(-) create mode 100755 osdc/modules/nfd/scripts/cleanup-arc-staging.sh diff --git a/osdc/modules/arc-runners/defs/l-bx86iavx512-88-1000-a100-8.yaml b/osdc/modules/arc-runners/defs/l-bx86iavx512-88-1000-a100-8.yaml index c1dc6cd9..e3abf1a9 100644 --- a/osdc/modules/arc-runners/defs/l-bx86iavx512-88-1000-a100-8.yaml +++ b/osdc/modules/arc-runners/defs/l-bx86iavx512-88-1000-a100-8.yaml @@ -8,3 +8,5 @@ runner: vcpu: 88 memory: 1000Gi gpu: 8 + max_runners: + default: 1 # TESTING: minimal for arc-staging NUMA test diff --git a/osdc/modules/arc-runners/defs/l-x86iavx512-11-125-a100.yaml b/osdc/modules/arc-runners/defs/l-x86iavx512-11-125-a100.yaml index a19f2da1..ac3bec44 100644 --- a/osdc/modules/arc-runners/defs/l-x86iavx512-11-125-a100.yaml +++ b/osdc/modules/arc-runners/defs/l-x86iavx512-11-125-a100.yaml @@ -8,3 +8,5 @@ runner: vcpu: 11 memory: 125Gi gpu: 1 + max_runners: + default: 2 # TESTING: limited for arc-staging NUMA test diff --git a/osdc/modules/arc-runners/defs/l-x86iavx512-22-250-a100-2.yaml b/osdc/modules/arc-runners/defs/l-x86iavx512-22-250-a100-2.yaml index a416d880..56cc9d49 100644 --- a/osdc/modules/arc-runners/defs/l-x86iavx512-22-250-a100-2.yaml +++ b/osdc/modules/arc-runners/defs/l-x86iavx512-22-250-a100-2.yaml @@ -7,3 +7,5 @@ runner: vcpu: 22 memory: 250Gi gpu: 2 + max_runners: + default: 2 # TESTING: 
limited for arc-staging NUMA test diff --git a/osdc/modules/arc-runners/defs/l-x86iavx512-44-500-a100-4.yaml b/osdc/modules/arc-runners/defs/l-x86iavx512-44-500-a100-4.yaml index e86bb5f5..2300bd38 100644 --- a/osdc/modules/arc-runners/defs/l-x86iavx512-44-500-a100-4.yaml +++ b/osdc/modules/arc-runners/defs/l-x86iavx512-44-500-a100-4.yaml @@ -7,3 +7,6 @@ runner: vcpu: 44 memory: 500Gi gpu: 4 + max_runners: + default: 2 # TESTING: cap to 1 p4d node (2 NUMA zones × 4 GPUs) + scheduler_name: numa-scheduler # TESTING: NUMA-aware scheduling diff --git a/osdc/modules/nfd/helm/values.yaml b/osdc/modules/nfd/helm/values.yaml index ce15a291..6924b5a1 100644 --- a/osdc/modules/nfd/helm/values.yaml +++ b/osdc/modules/nfd/helm/values.yaml @@ -23,12 +23,10 @@ topologyUpdater: # evaluated by the NUMA-aware scheduler. updateInterval: 15s - # Scoped to H100 nodes (p5 fleet) where the packed pool uses - # single-numa-node topology policy. Only these multi-NUMA nodes - # need NRT data for the numa-scheduler to prevent - # TopologyAffinityError on 4-GPU jobs. + # TESTING: p4d for arc-staging validation. + # Production: change to node-fleet: p5. nodeSelector: - node-fleet: p5 + node-fleet: p4d # Tolerate every taint so the topology-updater schedules on p5 # nodes regardless of their taint set (node-fleet, instance-type, diff --git a/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml b/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml index 371e89e0..bc315125 100644 --- a/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml +++ b/osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml @@ -67,9 +67,10 @@ spec: spec: priorityClassName: system-node-critical - # Match the same nodes as the NFD topology-updater. + # TESTING: p4d for arc-staging validation. + # Production: change to node-fleet: p5. 
nodeSelector: - node-fleet: p5 + node-fleet: p4d # Tolerate every taint — same rationale as node-performance-tuning: # must coexist with all node-init.osdc.io/* startup taints without diff --git a/osdc/modules/nfd/scripts/cleanup-arc-staging.sh b/osdc/modules/nfd/scripts/cleanup-arc-staging.sh new file mode 100755 index 00000000..9406ac13 --- /dev/null +++ b/osdc/modules/nfd/scripts/cleanup-arc-staging.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +set -euo pipefail +# +# [TEST-ONLY] Clean up NUMA modules from arc-staging after testing. +# Removes all resources created by the nfd + numa-scheduler test deploy +# and restores the cluster to its pre-test state. +# +# Usage: bash modules/nfd/scripts/cleanup-arc-staging.sh +# +# After running this script, drop the test commit: +# git reset --hard HEAD~1 + +CTX="pytorch-arc-staging" + +echo "=== Cleaning up NUMA test resources from arc-staging ===" + +# 1. Delete namespaced resources (Helm releases + all pods/services/etc.) +echo "→ Deleting nfd namespace..." +kubectl --context "$CTX" delete namespace nfd --ignore-not-found +echo "→ Deleting numa-scheduler namespace..." +kubectl --context "$CTX" delete namespace numa-scheduler --ignore-not-found + +# 2. Delete cluster-scoped resources (not removed by namespace deletion) +echo "→ Removing NRT CRD..." +kubectl --context "$CTX" delete crd noderesourcetopologies.topology.node.k8s.io --ignore-not-found +echo "→ Removing nfd-taint-remover ClusterRole/ClusterRoleBinding..." +kubectl --context "$CTX" delete clusterrole nfd-taint-remover --ignore-not-found +kubectl --context "$CTX" delete clusterrolebinding nfd-taint-remover --ignore-not-found + +# 3. Restore local files to pre-test state (from HEAD~1: the test edits are committed at HEAD, so a plain `git checkout --` would be a no-op) +echo "→ Restoring modified files..." 
+git checkout HEAD~1 -- \ + modules/nfd/helm/values.yaml \ + modules/nfd/kubernetes/nfd-taint-remover.yaml \ + modules/nodepools/scripts/python/generate_nodepools.py \ + modules/nodepools/defs/p4d.yaml \ + modules/arc-runners/defs/l-x86iavx512-11-125-a100.yaml \ + modules/arc-runners/defs/l-x86iavx512-22-250-a100-2.yaml \ + modules/arc-runners/defs/l-x86iavx512-44-500-a100-4.yaml \ + modules/arc-runners/defs/l-bx86iavx512-88-1000-a100-8.yaml \ + clusters.yaml + +# 4. Redeploy nodepools + arc-runners to restore original state on cluster +echo "→ Redeploying nodepools (removes p4d startup taint, restores exclude_regions)..." +just deploy-module arc-staging nodepools +echo "→ Redeploying arc-runners (restores original A100 runner defs)..." +just deploy-module arc-staging arc-runners + +echo "" +echo "=== Cleanup complete ===" +echo "Now drop the test commit: git reset --hard HEAD~1" diff --git a/osdc/modules/nodepools/defs/p4d.yaml b/osdc/modules/nodepools/defs/p4d.yaml index a2d91bda..47f3fc18 100644 --- a/osdc/modules/nodepools/defs/p4d.yaml +++ b/osdc/modules/nodepools/defs/p4d.yaml @@ -11,8 +11,8 @@ fleet: name: p4d arch: amd64 gpu: true - exclude_regions: - - us-west-1 + # exclude_regions: # TESTING: temporarily allowing us-west-1 for arc-staging + # - us-west-1 instances: - type: p4d.24xlarge weight: 100 diff --git a/osdc/modules/nodepools/scripts/python/generate_nodepools.py b/osdc/modules/nodepools/scripts/python/generate_nodepools.py index 2e9cf0c1..f21c64f2 100755 --- a/osdc/modules/nodepools/scripts/python/generate_nodepools.py +++ b/osdc/modules/nodepools/scripts/python/generate_nodepools.py @@ -98,10 +98,9 @@ "key": "node-init.osdc.io/nfd-topology", "value": "true", "effect": "NoSchedule", - # NFD topology-updater only targets p5 nodes (nodeSelector: node-fleet: p5). - # Only emit the taint on nodepools where NFD actually runs — otherwise the - # node would be tainted with nothing to remove it. 
- "applies_when": lambda d: d.get("fleet_name") == "p5", + # TESTING: broadened to include p4d for arc-staging validation. + # Production: restrict to fleet_name == "p5" only. + "applies_when": lambda d: d.get("fleet_name") in ("p5", "p4d"), }, ]