Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ runner:
vcpu: 88
memory: 1000Gi
gpu: 8
max_runners:
default: 1 # TESTING: minimal for arc-staging NUMA test
2 changes: 2 additions & 0 deletions osdc/modules/arc-runners/defs/l-x86iavx512-11-125-a100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ runner:
vcpu: 11
memory: 125Gi
gpu: 1
max_runners:
default: 2 # TESTING: limited for arc-staging NUMA test
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ runner:
vcpu: 22
memory: 250Gi
gpu: 2
max_runners:
default: 2 # TESTING: limited for arc-staging NUMA test
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ runner:
vcpu: 44
memory: 500Gi
gpu: 4
max_runners:
default: 2 # TESTING: cap to 1 p4d node (2 NUMA zones × 4 GPUs)
scheduler_name: numa-scheduler # TESTING: NUMA-aware scheduling
8 changes: 3 additions & 5 deletions osdc/modules/nfd/helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,10 @@ topologyUpdater:
# evaluated by the NUMA-aware scheduler.
updateInterval: 15s

# Scoped to H100 nodes (p5 fleet) where the packed pool uses
# single-numa-node topology policy. Only these multi-NUMA nodes
# need NRT data for the numa-scheduler to prevent
# TopologyAffinityError on 4-GPU jobs.
# TESTING: p4d for arc-staging validation.
# Production: change to node-fleet: p5.
nodeSelector:
node-fleet: p5
node-fleet: p4d

# Tolerate every taint so the topology-updater schedules on p5
# nodes regardless of their taint set (node-fleet, instance-type,
Expand Down
5 changes: 3 additions & 2 deletions osdc/modules/nfd/kubernetes/nfd-taint-remover.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,10 @@ spec:
spec:
priorityClassName: system-node-critical

# Match the same nodes as the NFD topology-updater.
# TESTING: p4d for arc-staging validation.
# Production: change to node-fleet: p5.
nodeSelector:
node-fleet: p5
node-fleet: p4d

# Tolerate every taint — same rationale as node-performance-tuning:
# must coexist with all node-init.osdc.io/* startup taints without
Expand Down
51 changes: 51 additions & 0 deletions osdc/modules/nfd/scripts/cleanup-arc-staging.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env bash
set -euo pipefail
#
# [TEST-ONLY] Clean up NUMA modules from arc-staging after testing.
# Removes all resources created by the nfd + numa-scheduler test deploy
# and restores the cluster to its pre-test state.
#
# Usage: bash modules/nfd/scripts/cleanup-arc-staging.sh
#
# Environment:
#   BASE_REF  Git ref holding the pre-test version of the modified files.
#             Defaults to HEAD~1, i.e. the parent of the test commit.
#
# After running this script, drop the test commit:
# git reset --hard HEAD~1

CTX="pytorch-arc-staging"

# The test changes are committed, so the pre-test file contents live in the
# parent of the test commit, not in the index. Restoring from the index
# (plain `git checkout -- <paths>`) would be a no-op and step 4 would
# redeploy the TEST configuration.
BASE_REF="${BASE_REF:-HEAD~1}"

# All paths below are relative to the osdc/ directory (three levels above
# this script: osdc/modules/nfd/scripts/). Change into it so the script
# behaves the same regardless of the caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")/../../.."

echo "=== Cleaning up NUMA test resources from arc-staging ==="

# 1. Delete namespaced resources (Helm releases + all pods/services/etc.)
echo "→ Deleting nfd namespace..."
kubectl --context "$CTX" delete namespace nfd --ignore-not-found
echo "→ Deleting numa-scheduler namespace..."
kubectl --context "$CTX" delete namespace numa-scheduler --ignore-not-found

# 2. Delete cluster-scoped resources (not removed by namespace deletion)
echo "→ Removing NRT CRD..."
kubectl --context "$CTX" delete crd noderesourcetopologies.topology.node.k8s.io --ignore-not-found
echo "→ Removing nfd-taint-remover ClusterRole/ClusterRoleBinding..."
kubectl --context "$CTX" delete clusterrole nfd-taint-remover --ignore-not-found
kubectl --context "$CTX" delete clusterrolebinding nfd-taint-remover --ignore-not-found

# 3. Restore local files to pre-test state (taken from BASE_REF so that
#    committed test edits are actually reverted in the working tree).
echo "→ Restoring modified files..."
git checkout "$BASE_REF" -- \
  modules/nfd/helm/values.yaml \
  modules/nfd/kubernetes/nfd-taint-remover.yaml \
  modules/nodepools/scripts/python/generate_nodepools.py \
  modules/nodepools/defs/p4d.yaml \
  modules/arc-runners/defs/l-x86iavx512-11-125-a100.yaml \
  modules/arc-runners/defs/l-x86iavx512-22-250-a100-2.yaml \
  modules/arc-runners/defs/l-x86iavx512-44-500-a100-4.yaml \
  modules/arc-runners/defs/l-bx86iavx512-88-1000-a100-8.yaml \
  clusters.yaml

# 4. Redeploy nodepools + arc-runners to restore original state on cluster
echo "→ Redeploying nodepools (removes p4d startup taint, restores exclude_regions)..."
just deploy-module arc-staging nodepools
echo "→ Redeploying arc-runners (restores original A100 runner defs)..."
just deploy-module arc-staging arc-runners

echo ""
echo "=== Cleanup complete ==="
echo "Now drop the test commit: git reset --hard HEAD~1"
4 changes: 2 additions & 2 deletions osdc/modules/nodepools/defs/p4d.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ fleet:
name: p4d
arch: amd64
gpu: true
exclude_regions:
- us-west-1
# exclude_regions: # TESTING: temporarily allowing us-west-1 for arc-staging
# - us-west-1
instances:
- type: p4d.24xlarge
weight: 100
Expand Down
7 changes: 3 additions & 4 deletions osdc/modules/nodepools/scripts/python/generate_nodepools.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,9 @@
"key": "node-init.osdc.io/nfd-topology",
"value": "true",
"effect": "NoSchedule",
# NFD topology-updater only targets p5 nodes (nodeSelector: node-fleet: p5).
# Only emit the taint on nodepools where NFD actually runs — otherwise the
# node would be tainted with nothing to remove it.
"applies_when": lambda d: d.get("fleet_name") == "p5",
# TESTING: broadened to include p4d for arc-staging validation.
# Production: restrict to fleet_name == "p5" only.
"applies_when": lambda d: d.get("fleet_name") in ("p5", "p4d"),
},
]

Expand Down
Loading