From 55def5afcb98371d3c0e01b07b576b7b58a3f651 Mon Sep 17 00:00:00 2001
From: George Hong <georgehong@meta.com>
Date: Thu, 11 Jun 2026 14:35:09 -0700
Subject: [PATCH] arc-runners: support per-runner scheduler_name for workflow pods

[ghstack-poisoned]
---
 .../defs/l-x86iamx-88-900-h100-4.yaml         |  4 ++
 .../scripts/python/generate_runners.py        |  7 +++
 .../scripts/python/test_generate_runners.py   | 44 +++++++++++++++++++
 .../arc-runners/templates/runner.yaml.tpl     |  2 +-
 4 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/osdc/modules/arc-runners-h100/defs/l-x86iamx-88-900-h100-4.yaml b/osdc/modules/arc-runners-h100/defs/l-x86iamx-88-900-h100-4.yaml
index 7100922e..5e1f6793 100644
--- a/osdc/modules/arc-runners-h100/defs/l-x86iamx-88-900-h100-4.yaml
+++ b/osdc/modules/arc-runners-h100/defs/l-x86iamx-88-900-h100-4.yaml
@@ -7,6 +7,10 @@ runner:
   vcpu: 88
   memory: 900Gi
   gpu: 4
+  # NUMA-aware scheduling: p5.48xlarge is a 2-NUMA-node box (4 H100 per zone).
+  # Without this, the default scheduler may place a 4-GPU pod on a node where
+  # both zones are partially used, causing TopologyAffinityError.
+  scheduler_name: numa-scheduler
   # Fixed-capacity cap: 1 reserved 8-GPU node / 4 GPUs per runner = 2 concurrent runners.
   max_runners:
     default: 2
diff --git a/osdc/modules/arc-runners/scripts/python/generate_runners.py b/osdc/modules/arc-runners/scripts/python/generate_runners.py
index e1829b1e..e1f3af66 100755
--- a/osdc/modules/arc-runners/scripts/python/generate_runners.py
+++ b/osdc/modules/arc-runners/scripts/python/generate_runners.py
@@ -314,6 +314,12 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu
     # Optional maxRunners line — only emitted when max_runners is set in the def
     max_runners_line = f"maxRunners: {max_runners}" if max_runners is not None else ""
 
+    # Optional schedulerName for workflow pods, read from the runner def
+    # (e.g. scheduler_name: numa-scheduler on H100 4-GPU). When unset or
+    # empty, no schedulerName is emitted and the default scheduler is used.
+    scheduler_name = runner.get("scheduler_name", "")
+    scheduler_name_line = f"      schedulerName: {scheduler_name}" if scheduler_name else ""
+
     # Replace all template placeholders
     output_content = template_content
     replacements = {
@@ -342,6 +348,7 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu
         "{{PROACTIVE_CAPACITY}}": str(proactive_capacity),
         "{{MAX_BURST_CAPACITY}}": str(max_burst_capacity),
         "{{HUD_FAILURE_BASE_CAPACITY}}": str(hud_failure_base_capacity),
+        "{{SCHEDULER_NAME_LINE}}": scheduler_name_line,
     }
 
     for placeholder, value in replacements.items():
diff --git a/osdc/modules/arc-runners/scripts/python/test_generate_runners.py b/osdc/modules/arc-runners/scripts/python/test_generate_runners.py
index 51421c8a..e9c7273f 100644
--- a/osdc/modules/arc-runners/scripts/python/test_generate_runners.py
+++ b/osdc/modules/arc-runners/scripts/python/test_generate_runners.py
@@ -195,6 +195,7 @@ def make_def_file(
     max_burst_capacity=None,
     hud_failure_base_capacity=None,
     node_fleet=None,
+    scheduler_name=None,
 ):
     """Write a runner def YAML and return the path.
 
@@ -223,6 +224,8 @@ def make_def_file(
         runner["hud_failure_base_capacity"] = hud_failure_base_capacity
     if node_fleet is not None:
         runner["node_fleet"] = node_fleet
+    if scheduler_name is not None:
+        runner["scheduler_name"] = scheduler_name
     content = {"runner": runner}
     p = tmp_path / f"{name}.yaml"
     p.write_text(yaml.dump(content, default_flow_style=False))
@@ -1907,6 +1910,47 @@ def test_workflow_pod_node_fleet_matches_def(self, real_template, tmp_path, vari
         assert len(node_fleet_exprs) == 1
         assert node_fleet_exprs[0]["values"] == [expected_fleet]
 
+    def test_workflow_pod_scheduler_name_from_def(self, real_template, tmp_path):
+        """Workflow pod gets schedulerName when the runner def sets scheduler_name."""
+        variant = {
+            "name": "numa-runner",
+            "instance_type": "p5.48xlarge",
+            "vcpu": 88,
+            "memory": 900,
+            "gpu": 4,
+            "disk_size": 200,
+            "scheduler_name": "numa-scheduler",
+            "expected_workflow_fleet": "p5",
+        }
+        def_kwargs = {
+            "tmp_path": tmp_path,
+            "name": variant["name"],
+            "instance_type": variant["instance_type"],
+            "vcpu": variant["vcpu"],
+            "memory": variant["memory"],
+            "gpu": variant["gpu"],
+            "disk_size": variant["disk_size"],
+            "scheduler_name": variant["scheduler_name"],
+        }
+        def_file = make_def_file(**def_kwargs)
+        output_dir = tmp_path / "out"
+        output_dir.mkdir()
+        cluster_config = {
+            "github_config_url": "https://github.com/test-org",
+            "github_secret_name": "gh-secret",
+            "runner_name_prefix": "real-",
+        }
+        assert generate_runner(def_file, real_template, cluster_config, output_dir, "arc-runners") is True
+        docs = list(yaml.safe_load_all((output_dir / "numa-runner.yaml").read_text()))
+        workflow_pod = yaml.safe_load(docs[1]["data"]["job-pod.yaml"])
+        assert workflow_pod["spec"]["schedulerName"] == "numa-scheduler"
+
+    @pytest.mark.parametrize("variant", RUNNER_VARIANTS)
+    def test_workflow_pod_no_scheduler_name_by_default(self, real_template, tmp_path, variant):
+        """Workflow pod must NOT have schedulerName when the runner def omits scheduler_name."""
+        _, _, workflow_pod = _render_real(real_template, tmp_path, variant)
+        assert "schedulerName" not in workflow_pod["spec"]
+
     def test_gpu_workflow_pod_has_gpu_toleration_and_resources(self, real_template, tmp_path):
         """GPU runner's workflow pod must carry nvidia.com/gpu toleration and matching request/limit."""
         _, _, workflow_pod = _render_real(real_template, tmp_path, GPU_VARIANT.values[0])
diff --git a/osdc/modules/arc-runners/templates/runner.yaml.tpl b/osdc/modules/arc-runners/templates/runner.yaml.tpl
index 857e3f8c..c18a75a9 100644
--- a/osdc/modules/arc-runners/templates/runner.yaml.tpl
+++ b/osdc/modules/arc-runners/templates/runner.yaml.tpl
@@ -383,7 +383,7 @@ data:
       # Priority 20 — preempts placeholder-workflow (10) so workflow pods
       # can claim the capacity reserved by the placeholders they replace.
       priorityClassName: arc-workflow
-
+{{SCHEDULER_NAME_LINE}}
       # Prefer scheduling job pods on same node fleet as runner.
       # Tolerations enforce node-fleet constraints (every NodePool taints
       # with node-fleet=<fleet>:NoSchedule), so nodeSelector is not needed.