diff --git a/osdc/modules/arc-runners/scripts/python/generate_runners.py b/osdc/modules/arc-runners/scripts/python/generate_runners.py index 3df5871b..7eaa5c94 100755 --- a/osdc/modules/arc-runners/scripts/python/generate_runners.py +++ b/osdc/modules/arc-runners/scripts/python/generate_runners.py @@ -328,6 +328,20 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu # Optional maxRunners line — only emitted when max_runners is set in the def max_runners_line = f"maxRunners: {max_runners}" if max_runners is not None else "" + # Optional schedulerName for workflow pods, read only from the runner def + # (e.g. scheduler_name: numa-scheduler on H100 4-GPU); no cluster-level + # fallback is applied here. Missing/null/empty = default scheduler. + # + # The same value feeds two places so the real workflow pod and its + # capacity placeholder (ph-w-*) agree on the scheduler: + # - {{SCHEDULER_NAME_LINE}}: schedulerName on the real workflow pod spec. + # - {{SCHEDULER_NAME}}: CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME on the + # listener, which the ARC fork stamps onto the workflow placeholder. + # If they diverged, a NUMA-blind placeholder would reserve a slot the + # NUMA-aware real pod can't claim (broken reservation on NUMA nodes). 
+ scheduler_name = str(runner.get("scheduler_name") or "") + scheduler_name_line = f" schedulerName: {scheduler_name}" if scheduler_name else "" + # Replace all template placeholders output_content = template_content replacements = { @@ -356,6 +370,8 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu "{{PROACTIVE_CAPACITY}}": str(proactive_capacity), "{{MAX_BURST_CAPACITY}}": str(max_burst_capacity), "{{HUD_FAILURE_BASE_CAPACITY}}": str(hud_failure_base_capacity), + "{{SCHEDULER_NAME_LINE}}": scheduler_name_line, + "{{SCHEDULER_NAME}}": scheduler_name, } for placeholder, value in replacements.items(): diff --git a/osdc/modules/arc-runners/scripts/python/test_generate_runners.py b/osdc/modules/arc-runners/scripts/python/test_generate_runners.py index de355d6c..bffddb77 100644 --- a/osdc/modules/arc-runners/scripts/python/test_generate_runners.py +++ b/osdc/modules/arc-runners/scripts/python/test_generate_runners.py @@ -193,6 +193,7 @@ def make_def_file( max_burst_capacity=None, hud_failure_base_capacity=None, node_fleet=None, + scheduler_name=None, ): """Write a runner def YAML and return the path. @@ -221,6 +222,8 @@ def make_def_file( runner["hud_failure_base_capacity"] = hud_failure_base_capacity if node_fleet is not None: runner["node_fleet"] = node_fleet + if scheduler_name is not None: + runner["scheduler_name"] = scheduler_name content = {"runner": runner} p = tmp_path / f"{name}.yaml" p.write_text(yaml.dump(content, default_flow_style=False)) @@ -1858,6 +1861,19 @@ def _render_real(real_template, tmp_path, variant): return helm, configmap, workflow_pod +def _listener_env_value(helm, name): + """Return the value of a named env var on the listener container. + + Returns None when the env var is absent so callers can distinguish + "missing" from "present but empty". 
+ """ + containers = helm["listenerTemplate"]["spec"]["containers"] + for entry in containers[0].get("env", []): + if entry.get("name") == name: + return entry.get("value") + return None + + class TestRealTemplate: @pytest.mark.parametrize("variant", RUNNER_VARIANTS) def test_runner_pod_uses_arc_runner_priority_class(self, real_template, tmp_path, variant): @@ -1932,6 +1948,54 @@ def test_workflow_pod_node_fleet_matches_def(self, real_template, tmp_path, vari assert len(node_fleet_exprs) == 1 assert node_fleet_exprs[0]["values"] == [expected_fleet] + def test_workflow_pod_scheduler_name_from_def(self, real_template, tmp_path): + """Workflow pod gets schedulerName when the runner def sets scheduler_name.""" + variant = { + "name": "numa-runner", + "instance_type": "p5.48xlarge", + "vcpu": 88, + "memory": 900, + "gpu": 4, + "disk_size": 200, + "scheduler_name": "numa-scheduler", + "expected_workflow_fleet": "p5", + } + def_kwargs = { + "tmp_path": tmp_path, + "name": variant["name"], + "instance_type": variant["instance_type"], + "vcpu": variant["vcpu"], + "memory": variant["memory"], + "gpu": variant["gpu"], + "disk_size": variant["disk_size"], + "scheduler_name": variant["scheduler_name"], + } + def_file = make_def_file(**def_kwargs) + output_dir = tmp_path / "out" + output_dir.mkdir() + cluster_config = { + "github_config_url": "https://github.com/test-org", + "github_secret_name": "gh-secret", + "runner_name_prefix": "real-", + } + assert generate_runner(def_file, real_template, cluster_config, output_dir, "arc-runners") is True + docs = list(yaml.safe_load_all((output_dir / "numa-runner.yaml").read_text())) + helm = docs[0] + workflow_pod = yaml.safe_load(docs[1]["data"]["job-pod.yaml"]) + assert workflow_pod["spec"]["schedulerName"] == "numa-scheduler" + # The capacity placeholder (ph-w-*) must use the SAME scheduler so it + # reserves a slot the real workflow pod can actually claim on NUMA nodes. 
+ assert _listener_env_value(helm, "CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME") == "numa-scheduler" + + @pytest.mark.parametrize("variant", RUNNER_VARIANTS) + def test_workflow_pod_no_scheduler_name_by_default(self, real_template, tmp_path, variant): + """Workflow pod must NOT have schedulerName when the runner def omits scheduler_name.""" + helm, _, workflow_pod = _render_real(real_template, tmp_path, variant) + assert "schedulerName" not in workflow_pod["spec"] + # Placeholder scheduler env must be present-but-empty (the fork treats + # empty as default-scheduler), keeping it in sync with the workflow pod. + assert _listener_env_value(helm, "CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME") == "" + def test_gpu_workflow_pod_has_gpu_toleration_and_resources(self, real_template, tmp_path): """GPU runner's workflow pod must carry nvidia.com/gpu toleration and matching request/limit.""" _, _, workflow_pod = _render_real(real_template, tmp_path, GPU_VARIANT.values[0]) diff --git a/osdc/modules/arc-runners/templates/runner.yaml.tpl b/osdc/modules/arc-runners/templates/runner.yaml.tpl index 857e3f8c..15766a79 100644 --- a/osdc/modules/arc-runners/templates/runner.yaml.tpl +++ b/osdc/modules/arc-runners/templates/runner.yaml.tpl @@ -179,6 +179,13 @@ listenerTemplate: value: "{{GPU_COUNT}}" - name: CAPACITY_AWARE_WORKFLOW_DISK value: "{{DISK_SIZE}}" + # Stamp the workflow placeholder (ph-w-*) with the same scheduler the + # real workflow pod uses (see {{SCHEDULER_NAME_LINE}} in the job-pod + # template). Empty = default-scheduler (the fork only sets + # .Spec.SchedulerName when non-empty), so packing/reservation stay + # consistent on NUMA nodes. Single source: the def's scheduler_name. 
+ - name: CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME + value: "{{SCHEDULER_NAME}}" # Must match the runner container resources below (requests/limits) - name: CAPACITY_AWARE_RUNNER_CPU value: "750m" @@ -383,7 +390,7 @@ data: # Priority 20 — preempts placeholder-workflow (10) so workflow pods # can claim the capacity reserved by the placeholders they replace. priorityClassName: arc-workflow - +{{SCHEDULER_NAME_LINE}} # Prefer scheduling job pods on same node fleet as runner. # Tolerations enforce node-fleet constraints (every NodePool taints # with node-fleet=:NoSchedule), so nodeSelector is not needed.