Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions osdc/justfile
Original file line number Diff line number Diff line change
Expand Up @@ -2262,6 +2262,38 @@ generate-arc-runners cluster:
fi
echo "Generated ARC runner configs in ${OUTPUT_DIR} (${elapsed})"

# Render a single nodepool def to stdout (generation only, no cluster required).
# Useful for previewing the generated userData / kubelet config before deploying.
# just render-nodepool p4d # renders defs/p4d.yaml
# just render-nodepool p4d nfd # with nfd startup taint enabled
render-nodepool def *modules:
#!/usr/bin/env bash
set -euo pipefail
source "{{UPSTREAM}}/scripts/mise-activate.sh"
DEF="{{def}}"
MODULES="{{modules}}"
MODULE_DIR="{{UPSTREAM}}/modules/nodepools"
DEFS_DIR="$MODULE_DIR/defs"
DEF_FILE="$DEFS_DIR/${DEF}.yaml"
if [[ ! -f "$DEF_FILE" ]]; then
echo "ERROR: $DEF_FILE not found" >&2
echo "Available defs:" >&2
ls "$DEFS_DIR"/*.yaml 2>/dev/null | xargs -n1 basename | sed 's/\.yaml$//' | sed 's/^/ /' >&2
exit 1
fi
TMPDEF=$(mktemp -d)
TMPOUT=$(mktemp -d)
trap 'rm -rf "$TMPDEF" "$TMPOUT"' EXIT
cp "$DEF_FILE" "$TMPDEF/"
NODEPOOLS_DEFS_DIR="$TMPDEF" \
NODEPOOLS_OUTPUT_DIR="$TMPOUT" \
NODEPOOLS_ENABLED_MODULES="$MODULES" \
uv run "$MODULE_DIR/scripts/python/generate_nodepools.py" >&2
for f in "$TMPOUT"/*.yaml; do
echo "--- # $(basename "$f")"
cat "$f"
done

# Monte Carlo cluster simulation for PyTorch CI load
simulate-cluster *args:
@export OSDC_ROOT="{{ROOT}}"; \
Expand Down
37 changes: 1 addition & 36 deletions osdc/modules/arc-runners/scripts/python/generate_runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from conditional_blocks import strip_conditional_block # noqa: E402
from fleet_naming import derive_fleet_name # noqa: E402
from nodepool_defs import load_excluded_instance_types # noqa: E402
from quantities import parse_memory_bytes # noqa: E402
from runner_fleet_validator import validate_cluster_runner_fleets # noqa: E402

# ANSI colors
Expand All @@ -59,42 +60,6 @@ def normalize_name(name):
return name.replace(".", "-").replace("_", "-")


# Kubernetes resource quantity suffixes → multiplier (bytes)
_K8S_MEMORY_SUFFIXES = {
"Ki": 1024,
"Mi": 1024**2,
"Gi": 1024**3,
"Ti": 1024**4,
"K": 1000,
"M": 1000**2,
"G": 1000**3,
"T": 1000**4,
}


def parse_memory_bytes(memory_str):
"""Convert a Kubernetes memory quantity string to bytes.

Supports binary (Ki, Mi, Gi, Ti) and decimal (K, M, G, T) suffixes,
as well as plain integer strings (already in bytes).

>>> parse_memory_bytes("115Gi")
123480309760
>>> parse_memory_bytes("512Mi")
536870912
>>> parse_memory_bytes("1024")
1024
"""
s = str(memory_str)
# Try two-char suffix first (Ki, Mi, Gi, Ti), then one-char (K, M, G, T)
for suffix_len in (2, 1):
if len(s) > suffix_len:
suffix = s[-suffix_len:]
if suffix in _K8S_MEMORY_SUFFIXES:
return int(s[:-suffix_len]) * _K8S_MEMORY_SUFFIXES[suffix]
return int(s)


def load_clusters_yaml(repo_root):
"""Load clusters.yaml from the repository root."""
config_path = repo_root / "clusters.yaml"
Expand Down
144 changes: 143 additions & 1 deletion osdc/modules/nodepools/scripts/python/generate_nodepools.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

from instance_specs import INSTANCE_SPECS # noqa: E402
from nodepool_defs import is_excluded_for_region as _is_excluded_for_region # noqa: E402
from quantities import parse_memory_bytes # noqa: E402

# List of startup taint entries. Each entry is a dict with:
# - ``key``, ``value``, ``effect`` (all str) — the Karpenter startupTaint to emit
Expand Down Expand Up @@ -118,6 +119,7 @@ def _validate_startup_taints_registry(modules_root: Path) -> None:

# ANSI colors
GREEN = "\033[0;32m"
YELLOW = "\033[0;33m"
RED = "\033[0;31m"
NC = "\033[0m"

Expand All @@ -126,6 +128,10 @@ def log_info(msg):
print(f"{GREEN}\u2192{NC} {msg}")


def log_warning(msg):
print(f"{YELLOW}\u26a0{NC} {msg}", file=sys.stderr)


def log_error(msg):
print(f"{RED}\u2717{NC} {msg}")

Expand Down Expand Up @@ -194,6 +200,125 @@ def _user_data_script_mime_part(indented_script):
"""


# Per-def kubelet memory knobs → (kubelet.config field, sub-key, quote-value?).
# These three share the shape ``<field>:\n <subkey>: <value>`` and are GENERAL
# kubelet settings (they change node-allocatable memory / the eviction threshold,
# and take effect on their own). nodeadm deep-merges each onto the EKS defaults,
# preserving sibling keys (cpu, ephemeral-storage, nodefs.*).
#
# NODEADM DEEP-MERGE ASSUMPTION: setting e.g. kubeReserved.memory preserves the
# EKS-default kubeReserved.cpu and kubeReserved.ephemeral-storage (likewise
# evictionHard.memory.available preserves nodefs.available and nodefs.inodesFree).
# Validated empirically on EKS AL2023 AMI (2026-06-17, p4d ip-10-20-26-82).
# This is a nodeadm implementation detail, NOT a contractual API guarantee — a
# future AMI/nodeadm version could change to replace-merge. Re-verify sibling
# preservation on every AMI update by checking ``kubectl get --raw
# /api/v1/nodes/<node>/proxy/configz`` on the first rolled node. If siblings
# are ever dropped, the mitigation is to emit all sibling keys explicitly here
# (pin kubeReserved.cpu / ephemeral-storage / evictionHard.nodefs.* alongside
# the memory keys).
_KUBELET_MEMORY_RESERVATIONS = (
("kube_reserved_memory", "kubeReserved", "memory", False),
("system_reserved_memory", "systemReserved", "memory", True), # quote so "0" stays a string
("eviction_hard_memory_available", "evictionHard", "memory.available", False),
)


def _validate_kubelet_memory(nodepool_def, topology_policy):
"""Validate the per-def kubelet memory knobs (see ``_kubelet_memory_block``).

The general knobs (kubeReserved/systemReserved/evictionHard) need no
cross-checks — they stand alone. The Memory Manager pieces do:
- ``reserved_memory`` is consumed only by the Memory Manager → requires Static;
- Static requires ``reserved_memory``;
- with Static AND all three reservation terms pinned, the kubelet boot gate
``sum(reservedMemory) == kubeReserved + systemReserved + evictionHard`` is
checked here (a bad sum stops the kubelet from starting). If a term is left
to the EKS default — unknown at generation time — warn instead of guess.
Also warns when Static is used without single-numa-node (no scheduling gain).
"""
name = nodepool_def["name"]
policy = nodepool_def.get("memory_manager_policy")
reserved_memory = nodepool_def.get("reserved_memory")

if policy is not None and policy != "Static":
raise ValueError(
f"{name}: memory_manager_policy must be 'Static' (got '{policy}'); "
f"None is the kubelet default and needs no override."
)
if reserved_memory is not None and policy != "Static":
raise ValueError(
f"{name}: reserved_memory requires memory_manager_policy: Static "
f"(reservedMemory is consumed only by the kubelet Memory Manager)."
)
if policy != "Static":
return # general knobs (if any) stand on their own — nothing more to check

if not reserved_memory:
raise ValueError(
f"{name}: memory_manager_policy: Static requires reserved_memory "
f"(the per-NUMA reservations the Memory Manager pins)."
)
if topology_policy != "single-numa-node":
log_warning(
f"{name}: memory_manager_policy: Static with topology_manager_policy="
f"'{topology_policy}' (not single-numa-node) — memory will be NUMA-reserved "
f"but the scheduler won't align it; Static only helps under single-numa-node."
)

terms = [nodepool_def.get(key) for key, *_ in _KUBELET_MEMORY_RESERVATIONS]
if all(t is not None for t in terms):
gate = sum(parse_memory_bytes(t) for t in terms)
reserved_total = sum(parse_memory_bytes(z["memory"]) for z in reserved_memory)
if reserved_total != gate:
raise ValueError(
f"{name}: reserved_memory sum ({reserved_total} bytes) must equal "
f"kube_reserved_memory + system_reserved_memory + "
f"eviction_hard_memory_available ({gate} bytes), or the kubelet will "
f"refuse to start. Adjust the per-NUMA split to total the reservation sum."
)
else:
log_warning(
f"{name}: memory_manager_policy: Static — boot gate NOT validated at generation "
f"because kube_reserved_memory/system_reserved_memory/eviction_hard_memory_available "
f"are not all pinned. The kubelet requires sum(reservedMemory) == kubeReserved + "
f"systemReserved + evictionHard (EKS defaults apply where unset) or it will not "
f"boot — pin all three to validate here."
)


def _kubelet_memory_block(nodepool_def):
"""Translate the per-def kubelet memory knobs to ``kubelet.config`` YAML lines
(10-space base), one block per key present, or "" when none are set.

Pure translation — call ``_validate_kubelet_memory`` first for the constraints.
- ``memory_manager_policy`` -> memoryManagerPolicy
- ``kube_reserved_memory`` -> kubeReserved.memory
- ``system_reserved_memory`` -> systemReserved.memory
- ``eviction_hard_memory_available`` -> evictionHard.memory.available
- ``reserved_memory`` -> reservedMemory (per-NUMA list)
"""
parts = []
policy = nodepool_def.get("memory_manager_policy")
if policy is not None:
parts.append(f" memoryManagerPolicy: {policy}\n")

for key, field, subkey, quote in _KUBELET_MEMORY_RESERVATIONS:
value = nodepool_def.get(key)
if value is not None:
rendered = f'"{value}"' if quote else value
parts.append(f" {field}:\n {subkey}: {rendered}\n")

reserved_memory = nodepool_def.get("reserved_memory")
if reserved_memory is not None:
zones = "".join(
f" - numaNode: {z['numa_node']}\n limits:\n memory: {z['memory']}\n"
for z in reserved_memory
)
parts.append(f" reservedMemory:\n{zones}")
return "".join(parts)


def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None):
"""Generate a combined NodePool + EC2NodeClass YAML string."""
name = nodepool_def["name"]
Expand All @@ -211,6 +336,12 @@ def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None):
topology_policy = nodepool_def.get("topology_manager_policy", "best-effort")
topology_scope = nodepool_def.get("topology_manager_scope", "container")

# Per-def kubelet memory knobs (kubeReserved / systemReserved / evictionHard /
# memoryManagerPolicy / reservedMemory): validate the constraints, then translate
# each set key 1:1 into kubelet.config. Each block is emitted only when present.
_validate_kubelet_memory(nodepool_def, topology_policy)
kubelet_memory_block = _kubelet_memory_block(nodepool_def)

# Read optional user data script for embedding as a MIME part
indented_userdata = _read_user_data_script(user_data_script_path, defs_dir) if defs_dir else None

Expand Down Expand Up @@ -441,6 +572,7 @@ def generate_nodepool_yaml(nodepool_def, module_name, defs_dir=None):
topologyManagerPolicy: {topology_policy}
topologyManagerScope: {topology_scope}
{" topologyManagerPolicyOptions:" + chr(10) + ' prefer-closest-numa-nodes: "true"' + chr(10) if topology_policy in ("restricted", "best-effort") else ""}\
{kubelet_memory_block}\
containerLogMaxSize: 50Mi
containerLogMaxFiles: 5
{_user_data_script_mime_part(indented_userdata)}
Expand Down Expand Up @@ -552,7 +684,17 @@ def _build_fleet_nodepool_def(fleet_data, inst, name_suffix="", extra_labels=Non

# Only set optional keys when explicitly provided — leaving them absent
# lets generate_nodepool_yaml() fall through to its own defaults.
for key in ("node_compactor", "topology_manager_policy", "topology_manager_scope", "user_data_script"):
for key in (
"node_compactor",
"topology_manager_policy",
"topology_manager_scope",
"user_data_script",
"memory_manager_policy",
"kube_reserved_memory",
"system_reserved_memory",
"eviction_hard_memory_available",
"reserved_memory",
):
val = inst.get(key)
if val is not None:
nodepool_def[key] = val
Expand Down
Loading
Loading