From 55def5afcb98371d3c0e01b07b576b7b58a3f651 Mon Sep 17 00:00:00 2001 From: George Hong Date: Thu, 11 Jun 2026 14:35:09 -0700 Subject: [PATCH] Update [ghstack-poisoned] --- .../defs/l-x86iamx-88-900-h100-4.yaml | 4 ++ .../scripts/python/generate_runners.py | 7 +++ .../scripts/python/test_generate_runners.py | 44 +++++++++++++++++++ .../arc-runners/templates/runner.yaml.tpl | 2 +- 4 files changed, 56 insertions(+), 1 deletion(-) diff --git a/osdc/modules/arc-runners-h100/defs/l-x86iamx-88-900-h100-4.yaml b/osdc/modules/arc-runners-h100/defs/l-x86iamx-88-900-h100-4.yaml index 7100922e..5e1f6793 100644 --- a/osdc/modules/arc-runners-h100/defs/l-x86iamx-88-900-h100-4.yaml +++ b/osdc/modules/arc-runners-h100/defs/l-x86iamx-88-900-h100-4.yaml @@ -7,6 +7,10 @@ runner: vcpu: 88 memory: 900Gi gpu: 4 + # NUMA-aware scheduling: p5.48xlarge is a 2-NUMA-node box (4 H100 per zone). + # Without this, the default scheduler may place a 4-GPU pod on a node where + # both zones are partially used, causing TopologyAffinityError. + scheduler_name: numa-scheduler # Fixed-capacity cap: 1 reserved 8-GPU node / 4 GPUs per runner = 2 concurrent runners. max_runners: default: 2 diff --git a/osdc/modules/arc-runners/scripts/python/generate_runners.py b/osdc/modules/arc-runners/scripts/python/generate_runners.py index e1829b1e..e1f3af66 100755 --- a/osdc/modules/arc-runners/scripts/python/generate_runners.py +++ b/osdc/modules/arc-runners/scripts/python/generate_runners.py @@ -314,6 +314,12 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu # Optional maxRunners line — only emitted when max_runners is set in the def max_runners_line = f"maxRunners: {max_runners}" if max_runners is not None else "" + # Optional schedulerName for workflow pods, read only from the runner def + # (e.g. scheduler_name: numa-scheduler on H100 4-GPU); cluster_config is not + # consulted. Empty or unset = no line emitted, so the default scheduler is used. 
+ scheduler_name = runner.get("scheduler_name", "") + scheduler_name_line = f" schedulerName: {scheduler_name}" if scheduler_name else "" + # Replace all template placeholders output_content = template_content replacements = { @@ -342,6 +348,7 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu "{{PROACTIVE_CAPACITY}}": str(proactive_capacity), "{{MAX_BURST_CAPACITY}}": str(max_burst_capacity), "{{HUD_FAILURE_BASE_CAPACITY}}": str(hud_failure_base_capacity), + "{{SCHEDULER_NAME_LINE}}": scheduler_name_line, } for placeholder, value in replacements.items(): diff --git a/osdc/modules/arc-runners/scripts/python/test_generate_runners.py b/osdc/modules/arc-runners/scripts/python/test_generate_runners.py index 51421c8a..e9c7273f 100644 --- a/osdc/modules/arc-runners/scripts/python/test_generate_runners.py +++ b/osdc/modules/arc-runners/scripts/python/test_generate_runners.py @@ -195,6 +195,7 @@ def make_def_file( max_burst_capacity=None, hud_failure_base_capacity=None, node_fleet=None, + scheduler_name=None, ): """Write a runner def YAML and return the path. 
@@ -223,6 +224,8 @@ def make_def_file( runner["hud_failure_base_capacity"] = hud_failure_base_capacity if node_fleet is not None: runner["node_fleet"] = node_fleet + if scheduler_name is not None: + runner["scheduler_name"] = scheduler_name content = {"runner": runner} p = tmp_path / f"{name}.yaml" p.write_text(yaml.dump(content, default_flow_style=False)) @@ -1907,6 +1910,47 @@ def test_workflow_pod_node_fleet_matches_def(self, real_template, tmp_path, vari assert len(node_fleet_exprs) == 1 assert node_fleet_exprs[0]["values"] == [expected_fleet] + def test_workflow_pod_scheduler_name_from_def(self, real_template, tmp_path): + """Workflow pod gets schedulerName when the runner def sets scheduler_name.""" + variant = { + "name": "numa-runner", + "instance_type": "p5.48xlarge", + "vcpu": 88, + "memory": 900, + "gpu": 4, + "disk_size": 200, + "scheduler_name": "numa-scheduler", + "expected_workflow_fleet": "p5", + } + def_kwargs = { + "tmp_path": tmp_path, + "name": variant["name"], + "instance_type": variant["instance_type"], + "vcpu": variant["vcpu"], + "memory": variant["memory"], + "gpu": variant["gpu"], + "disk_size": variant["disk_size"], + "scheduler_name": variant["scheduler_name"], + } + def_file = make_def_file(**def_kwargs) + output_dir = tmp_path / "out" + output_dir.mkdir() + cluster_config = { + "github_config_url": "https://github.com/test-org", + "github_secret_name": "gh-secret", + "runner_name_prefix": "real-", + } + assert generate_runner(def_file, real_template, cluster_config, output_dir, "arc-runners") is True + docs = list(yaml.safe_load_all((output_dir / "numa-runner.yaml").read_text())) + workflow_pod = yaml.safe_load(docs[1]["data"]["job-pod.yaml"]) + assert workflow_pod["spec"]["schedulerName"] == "numa-scheduler" + + @pytest.mark.parametrize("variant", RUNNER_VARIANTS) + def test_workflow_pod_no_scheduler_name_by_default(self, real_template, tmp_path, variant): + """Workflow pod must NOT have schedulerName when the runner def omits 
scheduler_name.""" + _, _, workflow_pod = _render_real(real_template, tmp_path, variant) + assert "schedulerName" not in workflow_pod["spec"] + def test_gpu_workflow_pod_has_gpu_toleration_and_resources(self, real_template, tmp_path): """GPU runner's workflow pod must carry nvidia.com/gpu toleration and matching request/limit.""" _, _, workflow_pod = _render_real(real_template, tmp_path, GPU_VARIANT.values[0]) diff --git a/osdc/modules/arc-runners/templates/runner.yaml.tpl b/osdc/modules/arc-runners/templates/runner.yaml.tpl index 857e3f8c..c18a75a9 100644 --- a/osdc/modules/arc-runners/templates/runner.yaml.tpl +++ b/osdc/modules/arc-runners/templates/runner.yaml.tpl @@ -383,7 +383,7 @@ data: # Priority 20 — preempts placeholder-workflow (10) so workflow pods # can claim the capacity reserved by the placeholders they replace. priorityClassName: arc-workflow - +{{SCHEDULER_NAME_LINE}} # Prefer scheduling job pods on same node fleet as runner. # Tolerations enforce node-fleet constraints (every NodePool taints # with node-fleet=:NoSchedule), so nodeSelector is not needed.