pytorch · georgehong · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 16, 2026
@@ -328,6 +328,20 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu
     # Optional maxRunners line — only emitted when max_runners is set in the def
     max_runners_line = f"maxRunners: {max_runners}" if max_runners is not None else ""
 
+    # Optional schedulerName for workflow pods. Per-runner-def value
+    # (e.g. scheduler_name: numa-scheduler on H100 4-GPU) takes precedence
+    # over the cluster-level default. Empty = default scheduler.
+    #
+    # The same value feeds two places so the real workflow pod and its
+    # capacity placeholder (ph-w-*) agree on the scheduler:
+    #   - {{SCHEDULER_NAME_LINE}}: schedulerName on the real workflow pod spec.
+    #   - {{SCHEDULER_NAME}}: CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME on the
+    #     listener, which the ARC fork stamps onto the workflow placeholder.
+    # If they diverged, a NUMA-blind placeholder would reserve a slot the
+    # NUMA-aware real pod can't claim (broken reservation on NUMA nodes).
+    scheduler_name = runner.get("scheduler_name", "")
+    scheduler_name_line = f"      schedulerName: {scheduler_name}" if scheduler_name else ""
+
     # Replace all template placeholders
     output_content = template_content
     replacements = {
@@ -356,6 +370,8 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu
         "{{PROACTIVE_CAPACITY}}": str(proactive_capacity),
         "{{MAX_BURST_CAPACITY}}": str(max_burst_capacity),
         "{{HUD_FAILURE_BASE_CAPACITY}}": str(hud_failure_base_capacity),
+        "{{SCHEDULER_NAME_LINE}}": scheduler_name_line,
+        "{{SCHEDULER_NAME}}": scheduler_name,
     }
 
     for placeholder, value in replacements.items():

@@ -193,6 +193,7 @@ def make_def_file(
     max_burst_capacity=None,
     hud_failure_base_capacity=None,
     node_fleet=None,
+    scheduler_name=None,
 ):
     """Write a runner def YAML and return the path.
 
@@ -221,6 +222,8 @@ def make_def_file(
         runner["hud_failure_base_capacity"] = hud_failure_base_capacity
     if node_fleet is not None:
         runner["node_fleet"] = node_fleet
+    if scheduler_name is not None:
+        runner["scheduler_name"] = scheduler_name
     content = {"runner": runner}
     p = tmp_path / f"{name}.yaml"
     p.write_text(yaml.dump(content, default_flow_style=False))
@@ -1858,6 +1861,19 @@ def _render_real(real_template, tmp_path, variant):
     return helm, configmap, workflow_pod
 
 
+def _listener_env_value(helm, name):
+    """Return the value of a named env var on the listener container.
+
+    Returns None when the env var is absent so callers can distinguish
+    "missing" from "present but empty".
+    """
+    containers = helm["listenerTemplate"]["spec"]["containers"]
+    for entry in containers[0].get("env", []):
+        if entry.get("name") == name:
+            return entry.get("value")
+    return None
+
+
 class TestRealTemplate:
     @pytest.mark.parametrize("variant", RUNNER_VARIANTS)
     def test_runner_pod_uses_arc_runner_priority_class(self, real_template, tmp_path, variant):
@@ -1932,6 +1948,54 @@ def test_workflow_pod_node_fleet_matches_def(self, real_template, tmp_path, vari
         assert len(node_fleet_exprs) == 1
         assert node_fleet_exprs[0]["values"] == [expected_fleet]
 
+    def test_workflow_pod_scheduler_name_from_def(self, real_template, tmp_path):
+        """Workflow pod gets schedulerName when the runner def sets scheduler_name."""
+        variant = {
+            "name": "numa-runner",
+            "instance_type": "p5.48xlarge",
+            "vcpu": 88,
+            "memory": 900,
+            "gpu": 4,
+            "disk_size": 200,
+            "scheduler_name": "numa-scheduler",
+            "expected_workflow_fleet": "p5",
+        }
+        def_kwargs = {
+            "tmp_path": tmp_path,
+            "name": variant["name"],
+            "instance_type": variant["instance_type"],
+            "vcpu": variant["vcpu"],
+            "memory": variant["memory"],
+            "gpu": variant["gpu"],
+            "disk_size": variant["disk_size"],
+            "scheduler_name": variant["scheduler_name"],
+        }
+        def_file = make_def_file(**def_kwargs)
+        output_dir = tmp_path / "out"
+        output_dir.mkdir()
+        cluster_config = {
+            "github_config_url": "https://github.com/test-org",
+            "github_secret_name": "gh-secret",
+            "runner_name_prefix": "real-",
+        }
+        assert generate_runner(def_file, real_template, cluster_config, output_dir, "arc-runners") is True
+        docs = list(yaml.safe_load_all((output_dir / "numa-runner.yaml").read_text()))
+        helm = docs[0]
+        workflow_pod = yaml.safe_load(docs[1]["data"]["job-pod.yaml"])
+        assert workflow_pod["spec"]["schedulerName"] == "numa-scheduler"
+        # The capacity placeholder (ph-w-*) must use the SAME scheduler so it
+        # reserves a slot the real workflow pod can actually claim on NUMA nodes.
+        assert _listener_env_value(helm, "CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME") == "numa-scheduler"
+
+    @pytest.mark.parametrize("variant", RUNNER_VARIANTS)
+    def test_workflow_pod_no_scheduler_name_by_default(self, real_template, tmp_path, variant):
+        """Workflow pod must NOT have schedulerName when the runner def omits scheduler_name."""
+        helm, _, workflow_pod = _render_real(real_template, tmp_path, variant)
+        assert "schedulerName" not in workflow_pod["spec"]
+        # Placeholder scheduler env must be present-but-empty (the fork treats
+        # empty as default-scheduler), keeping it in sync with the workflow pod.
+        assert _listener_env_value(helm, "CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME") == ""
+
     def test_gpu_workflow_pod_has_gpu_toleration_and_resources(self, real_template, tmp_path):
         """GPU runner's workflow pod must carry nvidia.com/gpu toleration and matching request/limit."""
         _, _, workflow_pod = _render_real(real_template, tmp_path, GPU_VARIANT.values[0])

@@ -179,6 +179,13 @@ listenerTemplate:
             value: "{{GPU_COUNT}}"
           - name: CAPACITY_AWARE_WORKFLOW_DISK
             value: "{{DISK_SIZE}}"
+          # Stamp the workflow placeholder (ph-w-*) with the same scheduler the
+          # real workflow pod uses (see {{SCHEDULER_NAME_LINE}} in the job-pod
+          # template). Empty = default-scheduler (the fork only sets
+          # .Spec.SchedulerName when non-empty), so packing/reservation stay
+          # consistent on NUMA nodes. Single source: the def's scheduler_name.
+          - name: CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME
+            value: "{{SCHEDULER_NAME}}"
           # Must match the runner container resources below (requests/limits)
           - name: CAPACITY_AWARE_RUNNER_CPU
             value: "750m"
@@ -383,7 +390,7 @@ data:
       # Priority 20 — preempts placeholder-workflow (10) so workflow pods
       # can claim the capacity reserved by the placeholders they replace.
       priorityClassName: arc-workflow
-
+{{SCHEDULER_NAME_LINE}}
       # Prefer scheduling job pods on same node fleet as runner.
       # Tolerations enforce node-fleet constraints (every NodePool taints
       # with node-fleet=<fleet>:NoSchedule), so nodeSelector is not needed.