Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ runner:
vcpu: 88
memory: 900Gi
gpu: 4
# NUMA-aware scheduling: p5.48xlarge is a 2-NUMA-node box (4 H100 per zone).
# Without this, the default scheduler may place a 4-GPU pod on a node where
# both zones are partially used, causing TopologyAffinityError.
scheduler_name: numa-scheduler
# Fixed-capacity cap: 1 reserved 8-GPU node / 4 GPUs per runner = 2 concurrent runners.
max_runners:
default: 2
Expand Down
7 changes: 7 additions & 0 deletions osdc/modules/arc-runners/scripts/python/generate_runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,12 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu
# Optional maxRunners line — only emitted when max_runners is set in the def
max_runners_line = f"maxRunners: {max_runners}" if max_runners is not None else ""

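# Optional schedulerName for workflow pods. The per-runner-def value
# (e.g. scheduler_name: numa-scheduler on H100 4-GPU) is intended to take
# precedence over the cluster-level default. Empty = default scheduler.
# NOTE(review): only the runner def is read below; no cluster-level
# fallback is visible in this lookup — confirm where that default is applied.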
scheduler_name = runner.get("scheduler_name", "")
scheduler_name_line = f" schedulerName: {scheduler_name}" if scheduler_name else ""

# Replace all template placeholders
output_content = template_content
replacements = {
Expand Down Expand Up @@ -342,6 +348,7 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu
"{{PROACTIVE_CAPACITY}}": str(proactive_capacity),
"{{MAX_BURST_CAPACITY}}": str(max_burst_capacity),
"{{HUD_FAILURE_BASE_CAPACITY}}": str(hud_failure_base_capacity),
"{{SCHEDULER_NAME_LINE}}": scheduler_name_line,
}

for placeholder, value in replacements.items():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ def make_def_file(
max_burst_capacity=None,
hud_failure_base_capacity=None,
node_fleet=None,
scheduler_name=None,
):
"""Write a runner def YAML and return the path.

Expand Down Expand Up @@ -223,6 +224,8 @@ def make_def_file(
runner["hud_failure_base_capacity"] = hud_failure_base_capacity
if node_fleet is not None:
runner["node_fleet"] = node_fleet
if scheduler_name is not None:
runner["scheduler_name"] = scheduler_name
content = {"runner": runner}
p = tmp_path / f"{name}.yaml"
p.write_text(yaml.dump(content, default_flow_style=False))
Expand Down Expand Up @@ -1907,6 +1910,47 @@ def test_workflow_pod_node_fleet_matches_def(self, real_template, tmp_path, vari
assert len(node_fleet_exprs) == 1
assert node_fleet_exprs[0]["values"] == [expected_fleet]

def test_workflow_pod_scheduler_name_from_def(self, real_template, tmp_path):
    """A runner def that sets scheduler_name must yield a workflow pod with that schedulerName."""
    # Write a 4-GPU runner def that opts in to the custom scheduler.
    runner_def_path = make_def_file(
        tmp_path=tmp_path,
        name="numa-runner",
        instance_type="p5.48xlarge",
        vcpu=88,
        memory=900,
        gpu=4,
        disk_size=200,
        scheduler_name="numa-scheduler",
    )

    rendered_dir = tmp_path / "out"
    rendered_dir.mkdir()

    cluster_settings = {
        "github_config_url": "https://github.com/test-org",
        "github_secret_name": "gh-secret",
        "runner_name_prefix": "real-",
    }

    succeeded = generate_runner(runner_def_path, real_template, cluster_settings, rendered_dir, "arc-runners")
    assert succeeded is True

    # The rendered file is a multi-document YAML; the second document's
    # data section embeds job-pod.yaml as a YAML string.
    rendered_text = (rendered_dir / "numa-runner.yaml").read_text()
    manifests = list(yaml.safe_load_all(rendered_text))
    job_pod = yaml.safe_load(manifests[1]["data"]["job-pod.yaml"])

    assert job_pod["spec"]["schedulerName"] == "numa-scheduler"

@pytest.mark.parametrize("variant", RUNNER_VARIANTS)
def test_workflow_pod_no_scheduler_name_by_default(self, real_template, tmp_path, variant):
    """Without scheduler_name in the runner def, the workflow pod spec carries no schedulerName key."""
    # Only the third element of the rendered triple (the workflow pod) matters here.
    _first, _second, job_pod = _render_real(real_template, tmp_path, variant)
    pod_spec = job_pod["spec"]
    assert "schedulerName" not in pod_spec

def test_gpu_workflow_pod_has_gpu_toleration_and_resources(self, real_template, tmp_path):
"""GPU runner's workflow pod must carry nvidia.com/gpu toleration and matching request/limit."""
_, _, workflow_pod = _render_real(real_template, tmp_path, GPU_VARIANT.values[0])
Expand Down
2 changes: 1 addition & 1 deletion osdc/modules/arc-runners/templates/runner.yaml.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ data:
# Priority 20 — preempts placeholder-workflow (10) so workflow pods
# can claim the capacity reserved by the placeholders they replace.
priorityClassName: arc-workflow

{{SCHEDULER_NAME_LINE}}
# Prefer scheduling job pods on same node fleet as runner.
# Tolerations enforce node-fleet constraints (every NodePool taints
# with node-fleet=<fleet>:NoSchedule), so nodeSelector is not needed.
Expand Down
Loading