Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions osdc/modules/arc-runners/scripts/python/generate_runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,20 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu
# Optional maxRunners line — only emitted when max_runners is set in the def
max_runners_line = f"maxRunners: {max_runners}" if max_runners is not None else ""

# Optional schedulerName for workflow pods. Per-runner-def value
# (e.g. scheduler_name: numa-scheduler on H100 4-GPU) takes precedence
# over the cluster-level default. Empty = default scheduler.
#
# The same value feeds two places so the real workflow pod and its
# capacity placeholder (ph-w-*) agree on the scheduler:
# - {{SCHEDULER_NAME_LINE}}: schedulerName on the real workflow pod spec.
# - {{SCHEDULER_NAME}}: CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME on the
# listener, which the ARC fork stamps onto the workflow placeholder.
# If they diverged, a NUMA-blind placeholder would reserve a slot the
# NUMA-aware real pod can't claim (broken reservation on NUMA nodes).
scheduler_name = runner.get("scheduler_name", "")
scheduler_name_line = f" schedulerName: {scheduler_name}" if scheduler_name else ""

# Replace all template placeholders
output_content = template_content
replacements = {
Expand Down Expand Up @@ -356,6 +370,8 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu
"{{PROACTIVE_CAPACITY}}": str(proactive_capacity),
"{{MAX_BURST_CAPACITY}}": str(max_burst_capacity),
"{{HUD_FAILURE_BASE_CAPACITY}}": str(hud_failure_base_capacity),
"{{SCHEDULER_NAME_LINE}}": scheduler_name_line,
"{{SCHEDULER_NAME}}": scheduler_name,
}

for placeholder, value in replacements.items():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ def make_def_file(
max_burst_capacity=None,
hud_failure_base_capacity=None,
node_fleet=None,
scheduler_name=None,
):
"""Write a runner def YAML and return the path.

Expand Down Expand Up @@ -221,6 +222,8 @@ def make_def_file(
runner["hud_failure_base_capacity"] = hud_failure_base_capacity
if node_fleet is not None:
runner["node_fleet"] = node_fleet
if scheduler_name is not None:
runner["scheduler_name"] = scheduler_name
content = {"runner": runner}
p = tmp_path / f"{name}.yaml"
p.write_text(yaml.dump(content, default_flow_style=False))
Expand Down Expand Up @@ -1858,6 +1861,19 @@ def _render_real(real_template, tmp_path, variant):
return helm, configmap, workflow_pod


def _listener_env_value(helm, name):
"""Return the value of a named env var on the listener container.

Returns None when the env var is absent so callers can distinguish
"missing" from "present but empty".
"""
containers = helm["listenerTemplate"]["spec"]["containers"]
for entry in containers[0].get("env", []):
if entry.get("name") == name:
return entry.get("value")
return None


class TestRealTemplate:
@pytest.mark.parametrize("variant", RUNNER_VARIANTS)
def test_runner_pod_uses_arc_runner_priority_class(self, real_template, tmp_path, variant):
Expand Down Expand Up @@ -1932,6 +1948,54 @@ def test_workflow_pod_node_fleet_matches_def(self, real_template, tmp_path, vari
assert len(node_fleet_exprs) == 1
assert node_fleet_exprs[0]["values"] == [expected_fleet]

def test_workflow_pod_scheduler_name_from_def(self, real_template, tmp_path):
"""Workflow pod gets schedulerName when the runner def sets scheduler_name."""
variant = {
"name": "numa-runner",
"instance_type": "p5.48xlarge",
"vcpu": 88,
"memory": 900,
"gpu": 4,
"disk_size": 200,
"scheduler_name": "numa-scheduler",
"expected_workflow_fleet": "p5",
}
def_kwargs = {
"tmp_path": tmp_path,
"name": variant["name"],
"instance_type": variant["instance_type"],
"vcpu": variant["vcpu"],
"memory": variant["memory"],
"gpu": variant["gpu"],
"disk_size": variant["disk_size"],
"scheduler_name": variant["scheduler_name"],
}
def_file = make_def_file(**def_kwargs)
output_dir = tmp_path / "out"
output_dir.mkdir()
cluster_config = {
"github_config_url": "https://github.com/test-org",
"github_secret_name": "gh-secret",
"runner_name_prefix": "real-",
}
assert generate_runner(def_file, real_template, cluster_config, output_dir, "arc-runners") is True
docs = list(yaml.safe_load_all((output_dir / "numa-runner.yaml").read_text()))
helm = docs[0]
workflow_pod = yaml.safe_load(docs[1]["data"]["job-pod.yaml"])
assert workflow_pod["spec"]["schedulerName"] == "numa-scheduler"
# The capacity placeholder (ph-w-*) must use the SAME scheduler so it
# reserves a slot the real workflow pod can actually claim on NUMA nodes.
assert _listener_env_value(helm, "CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME") == "numa-scheduler"

@pytest.mark.parametrize("variant", RUNNER_VARIANTS)
def test_workflow_pod_no_scheduler_name_by_default(self, real_template, tmp_path, variant):
"""Workflow pod must NOT have schedulerName when the runner def omits scheduler_name."""
helm, _, workflow_pod = _render_real(real_template, tmp_path, variant)
assert "schedulerName" not in workflow_pod["spec"]
# Placeholder scheduler env must be present-but-empty (the fork treats
# empty as default-scheduler), keeping it in sync with the workflow pod.
assert _listener_env_value(helm, "CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME") == ""

def test_gpu_workflow_pod_has_gpu_toleration_and_resources(self, real_template, tmp_path):
"""GPU runner's workflow pod must carry nvidia.com/gpu toleration and matching request/limit."""
_, _, workflow_pod = _render_real(real_template, tmp_path, GPU_VARIANT.values[0])
Expand Down
9 changes: 8 additions & 1 deletion osdc/modules/arc-runners/templates/runner.yaml.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,13 @@ listenerTemplate:
value: "{{GPU_COUNT}}"
- name: CAPACITY_AWARE_WORKFLOW_DISK
value: "{{DISK_SIZE}}"
# Stamp the workflow placeholder (ph-w-*) with the same scheduler the
# real workflow pod uses (see {{SCHEDULER_NAME_LINE}} in the job-pod
# template). Empty = default-scheduler (the fork only sets
# .Spec.SchedulerName when non-empty), so packing/reservation stay
# consistent on NUMA nodes. Single source: the def's scheduler_name.
- name: CAPACITY_AWARE_WORKFLOW_SCHEDULER_NAME
value: "{{SCHEDULER_NAME}}"
# Must match the runner container resources below (requests/limits)
- name: CAPACITY_AWARE_RUNNER_CPU
value: "750m"
Expand Down Expand Up @@ -383,7 +390,7 @@ data:
# Priority 20 — preempts placeholder-workflow (10) so workflow pods
# can claim the capacity reserved by the placeholders they replace.
priorityClassName: arc-workflow

{{SCHEDULER_NAME_LINE}}
# Prefer scheduling job pods on same node fleet as runner.
# Tolerations enforce node-fleet constraints (every NodePool taints
# with node-fleet=<fleet>:NoSchedule), so nodeSelector is not needed.
Expand Down
Loading