From aa2c430166cc0119de162879633ecc257766a786 Mon Sep 17 00:00:00 2001 From: Rakesh Paul Date: Fri, 8 May 2026 20:06:23 +0530 Subject: [PATCH 1/5] Add airgap packaging flow for Nemotron Customizer - Add deploy-scoped airgap tooling for Nemotron Customizer steps under src/nemotron/steps. - Build a portable submitter image plus deduplicated task images for selected workflow targets. - Expand step dependencies and map selected steps to task image families through a single airgap.yaml. - Discover small task-image Python dependency gaps and bake pinned repo overlays required by step configs. - Models, datasets, checkpoints, and customer data to be kept in external persistent storage by user - Add resumable build state, image manifests with checksums, Dockerfiles, SFT overlay configs, README guidance, and focused tests. Signed-off-by: Rakesh Paul --- deploy/nemotron-customizer/airgap/.gitignore | 7 + .../airgap/Dockerfile.submitter | 26 + .../airgap/Dockerfile.submitter.dockerignore | 21 + .../airgap/Dockerfile.task | 52 + .../airgap/Dockerfile.task.dockerignore | 14 + deploy/nemotron-customizer/airgap/README.md | 135 ++ deploy/nemotron-customizer/airgap/airgap.yaml | 130 ++ .../configs/sft_megatron_bridge_default.yaml | 12 + .../configs/sft_megatron_bridge_tiny.yaml | 12 + deploy/nemotron-customizer/airgap/runner.py | 1227 +++++++++++++++++ tests/deploy/test_airgap_runner.py | 376 +++++ 11 files changed, 2012 insertions(+) create mode 100644 deploy/nemotron-customizer/airgap/.gitignore create mode 100644 deploy/nemotron-customizer/airgap/Dockerfile.submitter create mode 100644 deploy/nemotron-customizer/airgap/Dockerfile.submitter.dockerignore create mode 100644 deploy/nemotron-customizer/airgap/Dockerfile.task create mode 100644 deploy/nemotron-customizer/airgap/Dockerfile.task.dockerignore create mode 100644 deploy/nemotron-customizer/airgap/README.md create mode 100644 deploy/nemotron-customizer/airgap/airgap.yaml create mode 100644 
deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml create mode 100644 deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml create mode 100644 deploy/nemotron-customizer/airgap/runner.py create mode 100644 tests/deploy/test_airgap_runner.py diff --git a/deploy/nemotron-customizer/airgap/.gitignore b/deploy/nemotron-customizer/airgap/.gitignore new file mode 100644 index 000000000..6ccaadce0 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/.gitignore @@ -0,0 +1,7 @@ +# Generated by airgap runner. +out/ +airgap-bundle/ +archives/ +__pycache__/ +*.lock.yaml +*.tar diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.submitter b/deploy/nemotron-customizer/airgap/Dockerfile.submitter new file mode 100644 index 000000000..ca2d0e131 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/Dockerfile.submitter @@ -0,0 +1,26 @@ +# Submitter image for Nemotron Customizer airgap. +# It contains the repo and a uv-synced environment. It does not run training. + +ARG BASE_IMAGE=python:3.12-slim +FROM ${BASE_IMAGE} + +ARG UV_VERSION=0.11.1 + +WORKDIR /workspace/Nemotron + +ENV UV_LINK_MODE=copy +ENV UV_PYTHON_DOWNLOADS=never +ENV HF_HUB_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=1 +ENV HF_DATASETS_OFFLINE=1 +ENV WANDB_MODE=offline +ENV PYTHONPATH=/workspace/Nemotron/src +ENV PATH=/workspace/Nemotron/.venv/bin:$PATH + +RUN python -m pip install --no-cache-dir "uv==${UV_VERSION}" + +COPY . . 
+ +RUN uv sync --frozen --no-dev + +CMD ["bash"] diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.submitter.dockerignore b/deploy/nemotron-customizer/airgap/Dockerfile.submitter.dockerignore new file mode 100644 index 000000000..6cecc5520 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/Dockerfile.submitter.dockerignore @@ -0,0 +1,21 @@ +.git +.venv +.ruff_cache +.pytest_cache +**/__pycache__ +**/*.pyc + +/.nemo_run +/outputs +/output +/logs +/checkpoints +/wandb +/data +/downloads + +deploy/nemotron-customizer/airgap/out +deploy/nemotron-customizer/airgap/airgap-bundle +deploy/nemotron-customizer/airgap/archives +deploy/nemotron-customizer/airgap/*.tar +deploy/nemotron-customizer/airgap/*.lock.yaml diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.task b/deploy/nemotron-customizer/airgap/Dockerfile.task new file mode 100644 index 000000000..bb68c321a --- /dev/null +++ b/deploy/nemotron-customizer/airgap/Dockerfile.task @@ -0,0 +1,52 @@ +# Derivative task image for Nemotron Customizer airgap. +# Built from the real training/runtime image and only adds small missing +# wrapper packages. + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ARG TASK_REQUIREMENTS +ARG REPO_OVERLAYS +ARG REPO_OVERLAYS_DIR +ARG PYTHON_BIN=python +ARG PIP_NO_DEPS=true + +ENV HF_HUB_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=1 +ENV HF_DATASETS_OFFLINE=1 +ENV WANDB_MODE=offline + +COPY ${TASK_REQUIREMENTS} /opt/nemotron-airgap/task-requirements.txt +COPY ${REPO_OVERLAYS} /opt/nemotron-airgap/repo-overlays.json +COPY ${REPO_OVERLAYS_DIR}/ /opt/nemotron-airgap/repo-overlays/ + +# Build-time installs keep --no-cache-dir so derivative image layers stay small. 
+RUN if [ -s /opt/nemotron-airgap/task-requirements.txt ]; then \ + if [ "${PIP_NO_DEPS}" = "true" ]; then \ + ${PYTHON_BIN} -m pip install --no-cache-dir --no-deps -r /opt/nemotron-airgap/task-requirements.txt; \ + else \ + ${PYTHON_BIN} -m pip install --no-cache-dir -r /opt/nemotron-airgap/task-requirements.txt; \ + fi; \ + fi && \ + ${PYTHON_BIN} - <<'PY' +import json +import pathlib +import shutil + +root = pathlib.Path("/opt/nemotron-airgap/repo-overlays") +items = json.loads(pathlib.Path("/opt/nemotron-airgap/repo-overlays.json").read_text()) +for item in items: + repo = item["repo"] + source = item.get("source", repo) + target = pathlib.Path(item["target"]) + src = root / source + if not src.exists(): + raise SystemExit(f"missing baked repo overlay: {src}") + if target.exists() or target.is_symlink(): + if target.is_dir() and not target.is_symlink(): + shutil.rmtree(target) + else: + target.unlink() + target.parent.mkdir(parents=True, exist_ok=True) + shutil.copytree(src, target) +PY diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.task.dockerignore b/deploy/nemotron-customizer/airgap/Dockerfile.task.dockerignore new file mode 100644 index 000000000..ef7a5c102 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/Dockerfile.task.dockerignore @@ -0,0 +1,14 @@ +** + +!deploy +!deploy/nemotron-customizer +!deploy/nemotron-customizer/airgap +!deploy/nemotron-customizer/airgap/out +!deploy/nemotron-customizer/airgap/out/task-context +!deploy/nemotron-customizer/airgap/out/task-context/** +!deploy/nemotron-customizer/airgap/out/repo-overlays +!deploy/nemotron-customizer/airgap/out/repo-overlays/** + +**/.git +**/__pycache__ +**/*.pyc diff --git a/deploy/nemotron-customizer/airgap/README.md b/deploy/nemotron-customizer/airgap/README.md new file mode 100644 index 000000000..ebbccbe9f --- /dev/null +++ b/deploy/nemotron-customizer/airgap/README.md @@ -0,0 +1,135 @@ +# Nemotron Customizer Airgap + +This folder is scoped only to Nemotron Customizer steps under 
+`src/nemotron/steps/`. + +The flow is intentionally small: + +1. Build one **submitter image** with this repo and `uv.lock`. +2. Build one or more **task images** by grouping selected workflow stages by base image. +3. Save those images as tarballs for the airgapped side. +4. Keep models, datasets, checkpoints, and customer files on persistent storage. + +Edit `airgap.yaml` first: + +- `workflow.stages`: the Nemotron Customizer steps the customer wants to run +- `dependencies`: central step dependency map, for example SFT training needs SFT packing +- `step_images`: which task image each step should use +- `task_images`: the base image, output tag, and known/import-probed Python requirements + +Only steps reached from `workflow.stages` are built. Steps are grouped by +`base_image + repo_overlays`; each group gets one derivative image with the +union of its small missing packages. If two selected step families share the +same base image and repo overlays, the runner emits one combined task image for +both. + +Run from the repo root: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml +``` + +That prints the plan. To actually pull/build/save images on the connected +machine: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml \ + --execute +``` + +To run only a few stages: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml \ + --stage validate \ + --stage discover-task-deps +``` + +To override the workflow without editing YAML, pass one or more selected +Nemotron step targets. Dependencies are still expanded from `dependencies`. 
+For example, SDG plus SFT also adds `prep/sft_packing` because SFT needs packed +data: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml \ + --target sdg/data_designer:tiny \ + --target sft/megatron_bridge:tiny +``` + +Outputs are written under `deploy/nemotron-customizer/airgap/out/` by default: + +- `airgap-manifest.yaml`: what was validated and built +- `airgap-progress.yaml`: incomplete execute run state used for resume +- `airgap-complete.yaml`: final execute run state after success +- `requirements-<name>.txt`: small missing packages per task image +- `repo-overlays-<name>.json`: git auto-mounts discovered from selected step configs +- `submitter-image.tar` +- `task-*.tar` +- SHA256 checksums for saved image tarballs in `airgap-manifest.yaml` + +If an execute run fails midway, leave `airgap-progress.yaml` in place and rerun +the same command. Completed expensive actions are reused when their artifacts +still exist. If you intentionally change the workflow or image plan before +finishing, move or remove `airgap-progress.yaml` first; the runner will not +silently overwrite incomplete state from a different plan. + +Runtime dependency probes use Docker volumes named +`nemotron-airgap-pip-cache-<name>` to avoid downloading the same wheels on +every probe loop. To reset them, run `docker volume ls | grep +nemotron-airgap-pip-cache` and remove the relevant volume with +`docker volume rm`. + +Large assets are not baked into images. The customer should stage them on +executor-visible persistent storage and reference them through config overrides +and `run.env.mounts`. + +During dependency discovery, the runner mounts the connected-machine checkout +into each task image only to probe imports. The final task image deliberately +does not bake this repo; the submitter image and the normal nemo-run/nemo-runspec +code transport provide the repo to the remote job at submission time. 
+ +Repo logistics stay outside `airgap.yaml`. If a selected step config contains +`${auto_mount:git+...}`, the runner treats it as a connected-machine build input: +it fetches that pinned repo and bakes it into the derivative task image at the +requested target path. Runtime jobs then use the baked image and do not clone +from GitHub. Site-specific data/model mounts remain in env profiles or step +overrides. + +If the connected machine is not the same architecture as the target cluster, +set `platform: linux/amd64` on the submitter or task image entry in +`airgap.yaml`. If you need to minimize transfer size for several images that +share layers, `docker save -o all-images.tar tag1 tag2 ...` can be used after +the runner builds the images; a single tar deduplicates shared layers better +than one tar per image. + +The Dockerfiles expect the chosen base images to have Python and `pip` available +for bootstrapping small offline additions. The runtime defaults bake +`HF_HUB_OFFLINE=1`, `TRANSFORMERS_OFFLINE=1`, `HF_DATASETS_OFFLINE=1`, and +`WANDB_MODE=offline`; customers with an internal mirror can override those at +submission time through their env profile or `run.env.env_vars`. + +For SFT Megatron-Bridge, build with the normal config so the runner can discover +the pinned Megatron-LM and Megatron-Bridge auto-mounts: + +```yaml +workflow: + stages: + - sft/megatron_bridge:tiny +``` + +When submitting inside the airgap, use the deploy overlay config so those git +auto-mounts are cleared at runtime while persistent storage mounts from the env +profile still apply. Use the image printed by the runner under +`selected step images`, or read it from `out/airgap-manifest.yaml` under +`step_images`. 
+ +```bash +uv run nemotron step run sft/megatron_bridge \ + -c deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml \ + -b \ + run.env.container_image= +``` diff --git a/deploy/nemotron-customizer/airgap/airgap.yaml b/deploy/nemotron-customizer/airgap/airgap.yaml new file mode 100644 index 000000000..7745ea857 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/airgap.yaml @@ -0,0 +1,130 @@ +# One file controls the Nemotron Customizer airgap plan. +# +# Change workflow.stages to the steps the customer wants. The runner expands +# dependencies, validates those step files/configs, groups selected steps by +# task image, then builds only the images needed for that selection. + +workflow: + name: sft-megatron-bridge + stages: + - sft/megatron_bridge:tiny + # Example SDG-only run: + # stages: + # - sdg/data_designer:tiny + # Example SDG -> SFT run: + # stages: + # - sdg/data_designer:tiny + # - sft/megatron_bridge:tiny + +build_stages: + - validate + - discover-task-deps + - build-submitter + - build-task-images + - save-images + +paths: + output_dir: deploy/nemotron-customizer/airgap/out + +submitter: + base_image: python:3.12-slim + tag: nemotron-customizer-submit-airgap:latest + tar: submitter-image.tar + +# Central dependency map. Keep this small and explicit: it is only for steps +# that naturally require a previous Nemotron Customizer step output. +dependencies: + sft/megatron_bridge: + - prep/sft_packing:tiny + peft/megatron_bridge: + - prep/sft_packing:tiny + pretrain/megatron_bridge: + - prep/pretrain_prep:tiny + pretrain/automodel: + - prep/pretrain_prep:tiny + rl/nemo_rl/dpo: + - prep/rl_prep:tiny + rl/nemo_rl/rlhf: + - prep/rl_prep:tiny + rl/nemo_rl/rlvr: + - prep/rl_prep:tiny + # SDG can feed SFT or RL prep, but it is not forced as a dependency because + # many customers bring their own JSONL on persistent storage. + +# Step -> task-image mapping. The runner only uses entries reached from +# workflow.stages after dependency expansion. 
+step_images: + byob: nemo-data-designer + convert/hf_to_megatron: nemo-megatron + convert/megatron_to_hf: nemo-megatron + convert/merge_lora: nemo-megatron + curate/nemo_curator: nemo-curator + env/env_toml: submitter-python + eval/model_eval: nemo-eval + optimize/modelopt/distill: nemo-modelopt + optimize/modelopt/prune: nemo-modelopt + optimize/modelopt/quantize: nemo-modelopt + peft/automodel: nemo-automodel + peft/megatron_bridge: nemo-megatron + prep/pretrain_prep: nemo-megatron + prep/rl_prep: nemo-rl + prep/sft_packing: nemo-megatron + pretrain/automodel: nemo-automodel + pretrain/megatron_bridge: nemo-megatron + rl/nemo_rl/dpo: nemo-rl + rl/nemo_rl/rlhf: nemo-rl + rl/nemo_rl/rlvr: nemo-rl + sdg/data_designer: nemo-data-designer + sft/automodel: nemo-automodel + sft/megatron_bridge: nemo-megatron + translate/nemo_skills: nemo-curator + translate/translation: nemo-curator + +task_images: + submitter-python: + base_image: python:3.12-slim + tag: nemotron-customizer-python-task-airgap:latest + tar: task-python-image.tar + + nemo-megatron: + base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano + tag: nemotron-customizer-nemo-megatron-airgap:latest + tar: task-nemo-megatron-image.tar + required_imports: [] + + nemo-automodel: + base_image: nvcr.io/nvidia/nemo-automodel:26.04 + tag: nemotron-customizer-nemo-automodel-airgap:latest + tar: task-nemo-automodel-image.tar + required_imports: [] + + nemo-rl: + base_image: nvcr.io/nvidia/nemo-rl:v0.6.0 + tag: nemotron-customizer-nemo-rl-airgap:latest + tar: task-nemo-rl-image.tar + required_imports: [] + + nemo-modelopt: + base_image: nvcr.io/nvidia/nemo:26.02 + tag: nemotron-customizer-nemo-modelopt-airgap:latest + tar: task-nemo-modelopt-image.tar + required_imports: [] + + nemo-curator: + base_image: nvcr.io/nvidia/nemo-curator:25.07 + tag: nemotron-customizer-nemo-curator-airgap:latest + tar: task-nemo-curator-image.tar + required_imports: [] + + nemo-data-designer: + base_image: 
nvcr.io/nvidia/nemo:25.11.nemotron_3_nano + tag: nemotron-customizer-nemo-data-designer-airgap:latest + tar: task-nemo-data-designer-image.tar + required_imports: + - data_designer + + nemo-eval: + base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano + tag: nemotron-customizer-nemo-eval-airgap:latest + tar: task-nemo-eval-image.tar + required_imports: [] diff --git a/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml new file mode 100644 index 000000000..d1854c2bf --- /dev/null +++ b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml @@ -0,0 +1,12 @@ +# Airgap runtime overlay for sft/megatron_bridge:default. +# +# The connected-machine airgap runner bakes the auto_mount repos from the base +# config into the derivative task image. At runtime, clear those git auto-mounts +# so the airgapped job does not clone from GitHub. Env-profile persistent +# storage mounts still append normally. + +defaults: ../../../../src/nemotron/steps/sft/megatron_bridge/config/default.yaml + +run: + env: + mounts: [] diff --git a/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml new file mode 100644 index 000000000..b2f54d38b --- /dev/null +++ b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml @@ -0,0 +1,12 @@ +# Airgap runtime overlay for sft/megatron_bridge:tiny. +# +# The connected-machine airgap runner bakes the auto_mount repos from the base +# config into the derivative task image. At runtime, clear those git auto-mounts +# so the airgapped job does not clone from GitHub. Env-profile persistent +# storage mounts still append normally. 
@dataclass(frozen=True)
class Target:
    """A requested Nemotron step, optionally pinned to a named config.

    The dataclass is frozen, so instances are immutable and hashable and two
    targets with the same ``step_id`` and ``config`` compare equal.
    """

    step_id: str
    config: str | None = None

    @property
    def spec(self) -> str:
        """Render the target as ``step-id[:config]``.

        A missing or empty ``config`` yields the bare step id with no colon.
        """
        if not self.config:
            return self.step_id
        return f"{self.step_id}:{self.config}"
Overrides workflow.stages.", + ) + args = parser.parse_args(argv) + + config_path = resolve_input_path(Path(args.config)) + cfg = load_yaml(config_path) + if args.target: + cfg = with_workflow_targets(cfg, normalize_target_specs(args.target)) + stages = normalize_stages(args.stage or cfg.get("build_stages") or cfg.get("stages") or []) + output_dir = resolve_repo_path(Path(cfg.get("paths", {}).get("output_dir", DEFAULT_OUTPUT_DIR))) + if "build-task-images" in stages: + validate_docker_context_path(output_dir, field="paths.output_dir") + output_dir.mkdir(parents=True, exist_ok=True) + run_state = load_or_start_run_state( + output_dir, + config_path=config_path, + cfg=cfg, + stages=stages, + execute=args.execute, + ) + saved_images: list[dict[str, Any]] = [] + workflow = cfg.get("workflow") if isinstance(cfg.get("workflow"), Mapping) else {} + + print(f"[airgap] config={config_path}") + print(f"[airgap] mode={'execute' if args.execute else 'plan'}") + print(f"[airgap] stages={', '.join(stages)}") + + expanded_targets: list[Target] = [] + step_infos: dict[str, StepInfo] = {} + groups: list[TaskGroup] = [] + workflow_manifest: dict[str, Any] = { + "stages": list(workflow.get("stages") or []), + } + if workflow.get("name"): + workflow_manifest["name"] = workflow.get("name") + manifest: dict[str, Any] = { + "schema_version": 1, + "workflow": workflow_manifest, + "output_dir": str(output_dir), + "build_stages": stages, + } + + if "validate" in stages or any(stage_needs_targets(stage) for stage in stages): + begin_action(run_state, "validate") + expanded_targets = expand_targets(cfg) + step_infos = validate_targets(expanded_targets) + manifest["targets"] = [step_to_manifest(info) for info in step_infos.values()] + print(f"[validate] {len(step_infos)} target(s) ok") + complete_action(run_state, "validate", {"targets": [target.spec for target in expanded_targets]}) + + if any(stage in stages for stage in ("discover-task-deps", "build-task-images", "save-images")): + groups = 
task_groups(cfg, output_dir=output_dir, step_infos=step_infos) + manifest["task_groups"] = [task_group_manifest(group) for group in groups] + + if "discover-task-deps" in stages: + if action_completed(run_state, "discover-task-deps") and hydrate_discovered_groups(run_state, groups): + print("[resume] skipping discover-task-deps; using saved probe results") + else: + begin_action(run_state, "discover-task-deps") + locked_versions = locked_package_versions(REPO_ROOT / "uv.lock") + for group in groups: + discover_task_deps(group, step_infos=step_infos, locked_versions=locked_versions, execute=args.execute) + remember_discovered_groups(run_state, groups) + complete_action(run_state, "discover-task-deps", {"groups": [group.name for group in groups]}) + manifest["task_groups"] = [task_group_manifest(group) for group in groups] + + if "build-submitter" in stages: + submitter = cfg.get("submitter", {}) + submitter_tag = str(submitter.get("tag") or "nemotron-customizer-submit-airgap:latest") + platform = submitter_platform(submitter) + action = "build-submitter" + if action_completed(run_state, action) and docker_image_exists(submitter_tag, platform=platform): + print(f"[resume] skipping {action}; image exists: {submitter_tag}") + else: + begin_action(run_state, action) + status = build_submitter(submitter, execute=args.execute) + if status: + return status + complete_action(run_state, action, {"image": submitter_tag}) + manifest["submitter"] = submitter_manifest(submitter) + + if "build-task-images" in stages: + clean_stale_group_dirs(output_dir, groups, execute=args.execute) + for group in groups: + action = f"build-task-image:{group.name}" + if action_completed(run_state, action) and docker_image_exists(group.tag, platform=group.platform): + print(f"[resume] skipping {action}; image exists: {group.tag}") + else: + begin_action(run_state, action) + status = build_task_image(group, output_dir=output_dir, execute=args.execute) + if status: + return status + 
complete_action(run_state, action, {"image": group.tag}) + manifest["task_groups"] = [task_group_manifest(group) for group in groups] + + if "save-images" in stages: + submitter = cfg.get("submitter", {}) + if submitter: + output = output_dir / str(submitter.get("tar", "submitter-image.tar")) + submitter_tag = str(submitter.get("tag") or "nemotron-customizer-submit-airgap:latest") + action = f"save-image:{submitter_tag}" + if action_completed(run_state, action) and output.exists(): + print(f"[resume] skipping {action}; tar exists: {output}") + else: + begin_action(run_state, action) + status = save_image(submitter_tag, output, args.execute) + if status: + return status + complete_action(run_state, action, {"tar": str(output)}) + saved_images.append( + saved_image_manifest(submitter_tag, output, execute=args.execute, role="submitter", name="submitter") + ) + for group in groups: + action = f"save-image:{group.tag}" + if action_completed(run_state, action) and group.tar.exists(): + print(f"[resume] skipping {action}; tar exists: {group.tar}") + else: + begin_action(run_state, action) + status = save_image(group.tag, group.tar, args.execute) + if status: + return status + complete_action(run_state, action, {"tar": str(group.tar)}) + saved_images.append( + saved_image_manifest(group.tag, group.tar, execute=args.execute, role="task", name=group.name) + ) + + manifest["persistent_assets"] = { + "policy": "models, datasets, checkpoints, and customer data stay on executor-visible persistent storage", + "mounts_from_configs": collect_mounts(step_infos.values()), + "baked_repo_overlays": [repo_overlay_manifest(item) for item in collect_repo_overlays(step_infos.values())], + } + manifest["step_images"] = step_image_manifest(groups) + manifest["saved_images"] = saved_images + manifest_path = output_dir / "airgap-manifest.yaml" + manifest_path.write_text(yaml.safe_dump(manifest, sort_keys=False), encoding="utf-8") + complete_run_state(run_state, manifest_path=manifest_path) + 
def normalize_target_specs(values: Iterable[str]) -> list[str]:
    """Flatten repeatable, comma-separated target arguments into one list.

    Each value is converted with ``str()``, split on commas, and stripped of
    surrounding whitespace. Empty pieces are dropped; order and duplicates are
    preserved.
    """
    pieces = (piece.strip() for value in values for piece in str(value).split(","))
    return [spec for spec in pieces if spec]
return None + path = output_dir / PROGRESS_STATE + done_path = output_dir / COMPLETE_STATE + signature = run_signature(config_path=config_path, cfg=cfg, stages=stages) + if path.exists(): + data = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + if not isinstance(data, dict): + raise SystemExit(f"{path} must contain YAML mapping state") + if data.get("signature") != signature: + raise SystemExit( + f"{path} is an incomplete airgap run for a different plan. " + f"Finish it, move it aside, or remove it before starting a new plan." + ) + print(f"[resume] found incomplete run state: {path}") + return RunState(path=path, done_path=done_path, data=data) + + workflow = cfg.get("workflow") if isinstance(cfg.get("workflow"), Mapping) else {} + data = { + "schema_version": 1, + "signature": signature, + "config": str(config_path.resolve()), + "workflow_stages": list(workflow.get("stages") or []), + "build_stages": stages, + "started_at": timestamp(), + "completed_actions": {}, + "discovered_groups": {}, + } + if done_path.exists(): + data["previous_complete"] = str(done_path) + state = RunState(path=path, done_path=done_path, data=data) + write_run_state(state) + print(f"[airgap] progress state={path}") + return state + + +def run_signature(*, config_path: Path, cfg: Mapping[str, Any], stages: list[str]) -> str: + payload = { + "config": str(config_path.resolve()), + "stages": stages, + "workflow": cfg.get("workflow"), + "dependencies": cfg.get("dependencies"), + "step_images": cfg.get("step_images"), + "task_images": cfg.get("task_images"), + "submitter": cfg.get("submitter"), + } + text = yaml.safe_dump(payload, sort_keys=True) + return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16] + + +def timestamp() -> str: + return datetime.now(UTC).isoformat(timespec="seconds").replace("+00:00", "Z") + + +def write_run_state(state: RunState | None) -> None: + if state is None: + return + state.data["updated_at"] = timestamp() + 
def hydrate_discovered_groups(state: RunState | None, groups: Iterable[TaskGroup]) -> bool:
    """Restore saved dependency-probe results from the run state onto ``groups``.

    Returns ``False`` without modifying any group when there is no run state,
    or when at least one group has no entry under ``discovered_groups``.
    Otherwise every group's probe fields are overwritten from its saved entry
    (absent or empty fields become empty collections) and ``True`` is returned.
    """
    if state is None:
        return False
    cached = state.data.get("discovered_groups") or {}
    # Materialise first: the iterable is walked twice and may be a one-shot iterator.
    pending = list(groups)
    for group in pending:
        if group.name not in cached:
            return False
    for group in pending:
        entry = cached[group.name]
        group.candidate_imports = set(entry.get("candidate_imports") or [])
        for attr in ("missing_imports", "missing_core_imports", "requirements"):
            setattr(group, attr, list(entry.get(attr) or []))
    return True
def normalize_stages(stages: Iterable[str]) -> list[str]:
    """Expand, de-duplicate, complete and order the requested build stages.

    Each input item may itself be a comma-separated list. Blank names and
    repeats are dropped. An empty request selects the full pipeline. Missing
    prerequisite stages are added (with a log line per addition), and the
    result is sorted into pipeline order; names that are not pipeline stages
    keep their relative order after the known ones.
    """
    pipeline = ["validate", "discover-task-deps", "build-submitter", "build-task-images", "save-images"]
    # (required, requested) edges, applied from later stages toward earlier
    # ones so a prerequisite added by one edge can trigger the next edge.
    prerequisites = (
        ("build-task-images", "save-images"),
        ("build-submitter", "save-images"),
        ("discover-task-deps", "build-task-images"),
        ("validate", "discover-task-deps"),
        ("validate", "build-task-images"),
        ("validate", "save-images"),
    )
    rank = {name: position for position, name in enumerate(pipeline)}

    selected: list[str] = []
    for raw in stages:
        for piece in str(raw).split(","):
            name = piece.strip()
            if name and name not in selected:
                selected.append(name)
    if not selected:
        selected = list(pipeline)

    for required, requested in prerequisites:
        if requested in selected and required not in selected:
            selected.insert(selected.index(requested), required)
            print(f"[airgap] auto-adding stage {required!r} because {requested!r} was requested")

    # Stable sort: unknown stage names share the last rank and keep their order.
    selected.sort(key=lambda name: rank.get(name, len(rank)))
    return selected
def read_config_mounts(config_path: Path | None) -> list[Any]:
    """Return the ``run.env.mounts`` list declared in a step config.

    This is a best-effort reader: it returns ``[]`` when ``config_path`` is
    ``None`` or missing, when the file cannot be read or parsed, or when the
    document has no list at ``run.env.mounts``.

    Read and parse failures are reported on stderr instead of being swallowed
    silently, so a malformed config is not mistaken for "no mounts".
    """
    if config_path is None or not config_path.exists():
        return []
    try:
        data = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
    except (OSError, ValueError, yaml.YAMLError) as exc:
        # ValueError covers undecodable bytes (UnicodeDecodeError) as well as
        # value errors raised while constructing YAML scalars.
        print(f"[airgap] warning: could not read mounts from {config_path}: {exc}", file=sys.stderr)
        return []
    node: Any = data
    for key in ("run", "env", "mounts"):
        # Any non-mapping level along the path means there are no mounts.
        if not isinstance(node, Mapping):
            return []
        node = node.get(key)
    return node if isinstance(node, list) else []
StepInfo], +) -> list[TaskGroup]: + step_images = normalize_step_images(cfg.get("step_images") or {}) + image_defs = normalize_task_images(cfg.get("task_images") or {}) + merged: dict[str, TaskGroup] = {} + + for step_id in step_infos: + image_name = step_images.get(step_id) + if not image_name: + raise SystemExit(f"{step_id}: missing step_images entry in airgap.yaml") + image_def = image_defs.get(image_name) + if image_def is None: + raise SystemExit(f"{step_id}: step_images points to unknown task image {image_name!r}") + base = str(image_def.get("base_image") or "").strip() + if not base: + raise SystemExit(f"task_images.{image_name}.base_image is required") + repo_overlays = getattr(step_infos[step_id], "repo_overlays", []) + group_key = task_group_key(base, repo_overlays) + group = merged.get(group_key) + if group is None: + suffix = short_hash( + { + "base_image": base, + "repo_overlays": [repo_overlay_manifest(item) for item in repo_overlays], + } + ) + group = TaskGroup( + name=f"{image_name}-{suffix}", + base_image=base, + tag="", + tar=output_dir / "task-image.tar", + steps=[], + platform=str(image_def["platform"]) if image_def.get("platform") else None, + pip_no_deps=bool(image_def.get("pip_no_deps", True)), + repo_overlays=list(repo_overlays), + ) + merged[group_key] = group + group.image_names.add(image_name) + group.steps.append(step_id) + group.required_imports.update(str(name) for name in image_def.get("required_imports") or []) + group.repo_overlays = merge_repo_overlays( + group.repo_overlays, + repo_overlays, + ) + for group in merged.values(): + finalize_task_group_name(group, image_defs=image_defs, output_dir=output_dir) + return list(merged.values()) + + +def finalize_task_group_name( + group: TaskGroup, + *, + image_defs: Mapping[str, Mapping[str, Any]], + output_dir: Path, +) -> None: + names = sorted(group.image_names) + suffix = short_hash( + { + "base_image": group.base_image, + "repo_overlays": [repo_overlay_manifest(item) for item in 
def tag_with_suffix(tag: str, suffix: str) -> str:
    """Append ``-suffix`` to the image name of a Docker reference.

    The suffix is placed before the ``:tag`` part when one is present. A
    ``:port`` in the registry host and any ``@digest`` are left unchanged.
    """
    reference, at, digest = tag.partition("@")
    last_slash = reference.rfind("/")
    last_colon = reference.rfind(":")
    if last_colon > last_slash:
        # The colon sits in the final path component, so it introduces a tag
        # rather than a registry port.
        suffixed = f"{reference[:last_colon]}-{suffix}{reference[last_colon:]}"
    else:
        suffixed = f"{reference}-{suffix}"
    # `at` and `digest` are both empty when the reference carries no digest.
    return suffixed + at + digest
def parse_git_overlay(spec: str, target: str) -> RepoOverlay:
    """Parse an ``auto_mount`` git spec of the form ``git+<url>@<ref>``.

    Args:
        spec: The git spec, e.g. ``git+https://host/org/repo.git@main``.
            Surrounding whitespace is ignored.
        target: Mount target path; surrounding whitespace is stripped.

    Returns:
        A ``RepoOverlay`` whose ``repo`` is the last URL path component with
        any ``.git`` suffix removed.

    Raises:
        SystemExit: If the spec does not start with ``git+``, has no
            non-empty ``@ref``, or has no repository path.
    """
    spec = spec.strip()
    if not spec.startswith("git+"):
        raise SystemExit(f"invalid auto_mount git spec: {spec!r}")
    url_and_ref = spec[4:]

    # For scheme URLs, only look for the ref separator in the path. An "@" in
    # the authority part (ssh://git@host/...) is user info, not a ref.
    scheme, scheme_sep, remainder = url_and_ref.partition("://")
    if scheme_sep:
        authority, _, path = remainder.partition("/")
        prefix, tail = f"{scheme}://{authority}/", path
    else:
        prefix, tail = "", url_and_ref

    location, ref_sep, ref = tail.rpartition("@")
    # Git ref names cannot contain ":". A colon here means the "@" belonged to
    # an scp-style "user@host:path" URL that has no ref at all.
    if not ref_sep or not ref or ":" in ref:
        raise SystemExit(f"invalid auto_mount git spec missing @ref: {spec!r}")
    if not location:
        raise SystemExit(f"invalid auto_mount git spec: {spec!r}")

    url = prefix + location
    repo = url.rstrip("/").split("/")[-1]
    if repo.endswith(".git"):
        repo = repo[:-4]
    if not repo:
        raise SystemExit(f"invalid auto_mount git spec: {spec!r}")
    return RepoOverlay(repo=repo, url=url, ref=ref, target=target.strip())
locked_versions=locked_versions, + pip_no_deps=group.pip_no_deps, + platform=group.platform, + ) + else: + missing = probe_missing_imports(group.base_image, sorted(imports), execute=False, platform=group.platform) + group.missing_imports = sorted(set(missing)) + group.missing_core_imports = sorted(name for name in missing if name.split(".", 1)[0] in CORE_IMPORTS) + installable = sorted(name for name in group.missing_imports if name not in group.missing_core_imports) + group.requirements = sorted(requirement_for_import(name, locked_versions) for name in installable) + + +def discover_external_imports(start: Path) -> set[str]: + external: set[str] = set() + try: + tree = ast.parse(start.read_text(encoding="utf-8")) + except SyntaxError: + return external + for node in ast.walk(tree): + imported: list[str] = [] + if isinstance(node, ast.Import): + imported = [alias.name for alias in node.names] + elif isinstance(node, ast.ImportFrom) and not node.level and node.module: + imported = [node.module] + for name in imported: + root = name.split(".", 1)[0] + if root in LOCAL_PREFIXES or is_stdlib(root): + continue + external.add(root) + return external + + +def is_stdlib(root: str) -> bool: + if root in sys.builtin_module_names: + return True + stdlib_names = getattr(sys, "stdlib_module_names", set()) + if root in stdlib_names: + return True + return False + + +def probe_missing_imports(image: str, imports: list[str], *, execute: bool, platform: str | None = None) -> list[str]: + if not imports: + return [] + code = ( + "import importlib.util,json;" + f"mods={imports!r};" + "missing=[m for m in mods if importlib.util.find_spec(m) is None];" + "print(json.dumps(missing))" + ) + cmd = ["docker", "run", "--rm", "--pull", "never"] + if platform: + cmd.extend(["--platform", platform]) + cmd.extend([image, "python", "-c", code]) + if not execute: + print_cmd(cmd) + return [] + ensure_image(image, platform=platform) + result = subprocess.run(cmd, check=False, capture_output=True, 
text=True, cwd=REPO_ROOT) + if result.returncode != 0: + print(result.stderr or result.stdout, file=sys.stderr) + raise SystemExit(result.returncode) + return [str(item) for item in json.loads(result.stdout.strip() or "[]")] + + +def probe_step_modules( + image: str, + modules: list[str], + *, + required_imports: Iterable[str], + locked_versions: Mapping[str, str], + pip_no_deps: bool, + platform: str | None = None, +) -> list[str]: + """Import selected step modules in the task image and discover missing imports. + + The loop installs only the packages it has already identified, in an + ephemeral container, so the final requirements file stays based on actual + import failures rather than broad static guesses. + """ + + ensure_image(image, platform=platform) + missing: list[str] = [] + requirements: list[str] = [] + imports = sorted(set(required_imports)) + import_code = "import importlib;" + import_code += "".join(f"importlib.import_module({module!r});" for module in imports) + import_code += "".join(f"importlib.import_module({module!r});" for module in modules) + for _ in range(20): + install = "" + if requirements: + no_deps = "--no-deps " if pip_no_deps else "" + install = "python -m pip install " + no_deps + install += " ".join(shlex_quote(req) for req in requirements) + install += ( + " >/tmp/nemotron-airgap-pip.log 2>&1 " + "|| { echo '[airgap-pip] failed:'; cat /tmp/nemotron-airgap-pip.log; exit 1; } && " + ) + cmd = [ + "docker", + "run", + "--rm", + "--pull", + "never", + "--mount", + f"type=volume,source={pip_cache_volume(platform)},target=/root/.cache/pip", + "-v", + f"{REPO_ROOT}:/workspace/Nemotron:ro", + "-w", + "/workspace/Nemotron", + "-e", + "PYTHONPATH=/workspace/Nemotron/src", + ] + if platform: + cmd.extend(["--platform", platform]) + cmd.extend([image, "bash", "-lc", install + "python -c " + shlex_quote(import_code)]) + result = subprocess.run(cmd, check=False, capture_output=True, text=True, cwd=REPO_ROOT) + if result.returncode == 0: + 
return missing + text = result.stderr + "\n" + result.stdout + match = re.search(r"(?:ModuleNotFoundError|ImportError):\s+No module named ['\"]([^'\"]+)['\"]", text) + if not match: + print(text, file=sys.stderr) + raise SystemExit(result.returncode) + import_name = match.group(1).split(".", 1)[0] + if import_name not in missing: + missing.append(import_name) + if import_name in CORE_IMPORTS: + print(f"[probe] base image is missing core import {import_name!r}; choose a compatible task image") + return missing + requirement = requirement_for_import(import_name, locked_versions) + if requirement in requirements: + return missing + requirements.append(requirement) + raise SystemExit(f"import probe did not converge for {image}") + + +def requirement_for_import(import_name: str, locked_versions: Mapping[str, str]) -> str: + package = package_for_import(import_name) + version = locked_versions.get(normalize_package(package)) + return f"{package}=={version}" if version else package + + +def package_for_import(import_name: str) -> str: + if import_name in IMPORT_ALIASES: + return IMPORT_ALIASES[import_name] + packages = metadata.packages_distributions().get(import_name) + if packages: + return normalize_package(packages[0]) + return import_name.replace("_", "-") + + +def locked_package_versions(lock_path: Path) -> dict[str, str]: + if not lock_path.exists(): + return {} + data = tomllib.loads(lock_path.read_text(encoding="utf-8")) + versions: dict[str, str] = {} + for package in data.get("package", []) or []: + name = package.get("name") + version = package.get("version") + if isinstance(name, str) and isinstance(version, str): + versions[normalize_package(name)] = version + return versions + + +def normalize_package(name: str) -> str: + return re.sub(r"[-_.]+", "-", name).lower() + + +def build_submitter(submitter: Mapping[str, Any], *, execute: bool) -> int: + image = str(submitter.get("tag") or "nemotron-customizer-submit-airgap:latest") + base = 
def submitter_platform(submitter: Mapping[str, Any]) -> str | None:
    """Return the submitter's ``platform`` setting as a string, or ``None``.

    A missing or falsy ``platform`` value yields ``None``.
    """
    platform = submitter.get("platform")
    if not platform:
        return None
    return str(platform)
def docker_image_platform(image: str) -> str | None:
    """Return the platform string of a locally available Docker image.

    Runs ``docker image inspect`` and returns the first output line, formatted
    as ``os/arch`` or ``os/arch/variant``. Returns ``None`` when the inspect
    command exits non-zero or prints nothing.
    """
    inspect = subprocess.run(
        [
            "docker",
            "image",
            "inspect",
            "--format",
            "{{.Os}}/{{.Architecture}}{{if .Variant}}/{{.Variant}}{{end}}",
            image,
        ],
        # capture_output=True cannot be combined with an explicit stderr=:
        # subprocess.run raises ValueError. Capture stdout explicitly and
        # discard the inspect error text instead.
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        text=True,
        check=False,
        cwd=REPO_ROOT,
    )
    if inspect.returncode != 0:
        return None
    lines = inspect.stdout.strip().splitlines()
    return lines[0] if lines else None
def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB chunks, so it is never loaded into memory whole.
    """
    chunk_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while chunk := stream.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()
def step_image_manifest(groups: Iterable[TaskGroup]) -> dict[str, str]:
    """Map each step id to the image of the task group that contains it.

    A group's image is its ``selected_image`` when set, otherwise its ``tag``.
    If a step id appears in more than one group, the later group wins.
    """
    return {
        step_id: (group.selected_image or group.tag)
        for group in groups
        for step_id in group.steps
    }
def sanitize(value: str) -> str:
    """Turn ``value`` into a lowercase token usable in image and file names.

    Every run of characters outside ``[a-zA-Z0-9_.-]`` becomes a single ``-``,
    and leading and trailing dashes are removed. If nothing remains, the
    placeholder ``"image"`` is returned.
    """
    collapsed = re.sub(r"[^a-zA-Z0-9_.-]+", "-", value)
    token = collapsed.strip("-").lower()
    if not token:
        return "image"
    return token
runner = _runner_module() + overlay = runner.RepoOverlay( + repo="Megatron-Bridge", + url="https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", + ref="main", + target="/opt/Megatron-Bridge", + ) + cfg = { + "step_images": { + "prep/sft_packing": "a", + "sft/megatron_bridge": "b", + }, + "task_images": { + "a": { + "base_image": "image:base", + "tag": "image:a", + "tar": "a.tar", + "required_imports": ["omegaconf"], + }, + "b": { + "base_image": "image:base", + "tag": "image:b", + "tar": "b.tar", + "required_imports": ["yaml"], + }, + }, + } + + groups = runner.task_groups( + cfg, + output_dir=tmp_path, + step_infos={ + "prep/sft_packing": runner.StepInfo( + target=runner.Target("prep/sft_packing"), + step_dir=tmp_path, + step_py=tmp_path / "step.py", + step_toml=tmp_path / "step.toml", + config_path=None, + module="x", + ), + "sft/megatron_bridge": runner.StepInfo( + target=runner.Target("sft/megatron_bridge"), + step_dir=tmp_path, + step_py=tmp_path / "step.py", + step_toml=tmp_path / "step.toml", + config_path=None, + module="y", + repo_overlays=[overlay], + ), + }, + ) + + assert len(groups) == 2 + by_step = {group.steps[0]: group for group in groups} + assert by_step["prep/sft_packing"].base_image == "image:base" + assert by_step["prep/sft_packing"].required_imports == {"omegaconf"} + assert by_step["prep/sft_packing"].repo_overlays == [] + assert by_step["sft/megatron_bridge"].base_image == "image:base" + assert by_step["sft/megatron_bridge"].required_imports == {"yaml"} + assert by_step["sft/megatron_bridge"].repo_overlays == [overlay] + assert len({group.tag for group in groups}) == 2 + + +def test_airgap_runner_only_builds_images_for_selected_steps(tmp_path): + runner = _runner_module() + cfg = { + "step_images": { + "prep/sft_packing": "nemo-megatron", + "sft/automodel": "nemo-automodel", + }, + "task_images": { + "nemo-megatron": {"base_image": "nemo:base"}, + "nemo-automodel": {"base_image": "automodel:base"}, + }, + } + + groups = 
def test_airgap_runner_target_override_selects_sdg_and_sft():
    """Overriding workflow targets pulls in the SFT prerequisite and merges images."""
    runner = _runner_module()
    requested = ["sdg/data_designer:tiny", "sft/megatron_bridge:tiny"]
    base_cfg = runner.load_yaml(runner.AIRGAP_DIR / "airgap.yaml")
    cfg = runner.with_workflow_targets(base_cfg, runner.normalize_target_specs(requested))

    targets = runner.expand_targets(cfg)
    infos = runner.validate_targets(targets)
    groups = runner.task_groups(cfg, output_dir=runner.AIRGAP_DIR / "out", step_infos=infos)

    # The packing step is not requested explicitly; it arrives as a dependency.
    expected_specs = [
        "sdg/data_designer:tiny",
        "prep/sft_packing:tiny",
        "sft/megatron_bridge:tiny",
    ]
    assert [target.spec for target in targets] == expected_specs

    by_steps = {tuple(group.steps): group for group in groups}
    merged = by_steps[("sdg/data_designer", "prep/sft_packing")]
    assert merged.image_names == {"nemo-data-designer", "nemo-megatron"}
    assert merged.tag.startswith("nemotron-customizer-nemo-data-designer-nemo-megatron-airgap-")
ref="main", + target="/opt/Other-Bridge", + ) + + assert runner.repo_overlay_dir_name(first) != runner.repo_overlay_dir_name(second) + assert runner.repo_overlay_build_manifest(first)["source"] == runner.repo_overlay_dir_name(first) + + +def test_airgap_runner_auto_adds_stage_prerequisites(): + runner = _runner_module() + + assert runner.normalize_stages(["build-task-images"]) == [ + "validate", + "discover-task-deps", + "build-task-images", + ] + assert runner.normalize_stages(["save-images"]) == [ + "validate", + "discover-task-deps", + "build-submitter", + "build-task-images", + "save-images", + ] + + +def test_airgap_runner_rejects_build_output_outside_docker_context(tmp_path): + runner = _runner_module() + + with pytest.raises(SystemExit, match="paths.output_dir=.*must live under the repo root"): + runner.validate_docker_context_path(tmp_path, field="paths.output_dir") + + +def test_airgap_runner_reports_dependency_cycles(): + runner = _runner_module() + cfg = { + "workflow": {"stages": ["a/b"]}, + "dependencies": { + "a/b": ["c/d"], + "c/d": ["a/b"], + }, + } + + with pytest.raises(SystemExit, match=r"cyclic airgap dependency detected: a/b -> c/d -> a/b"): + runner.expand_targets(cfg) + + +def test_airgap_runner_tag_suffix_handles_ports_and_digests(): + runner = _runner_module() + + assert runner.tag_with_suffix("registry:5000/team/image:latest", "abc123") == ( + "registry:5000/team/image-abc123:latest" + ) + assert runner.tag_with_suffix("repo/image:latest@sha256:deadbeef", "abc123") == ( + "repo/image-abc123:latest@sha256:deadbeef" + ) + assert runner.tag_with_suffix("repo/image@sha256:deadbeef", "abc123") == "repo/image-abc123@sha256:deadbeef" + + +def test_airgap_runner_saved_image_manifest_has_checksum(tmp_path): + runner = _runner_module() + image_tar = tmp_path / "image.tar" + image_tar.write_text("image bytes", encoding="utf-8") + + saved = runner.saved_image_manifest("image:tag", image_tar, execute=True, role="task", name="group") + + assert 
saved["role"] == "task" + assert saved["name"] == "group" + assert saved["image"] == "image:tag" + assert saved["tar"] == str(image_tar) + assert saved["sha256"] == runner.sha256_file(image_tar) + + +def test_airgap_runner_platform_matching_accepts_variant_only_when_compatible(): + runner = _runner_module() + + assert runner.platform_matches("linux/amd64", "linux/amd64") + assert runner.platform_matches("linux/arm64/v8", "linux/arm64") + assert not runner.platform_matches("linux/amd64", "linux/arm64") + assert runner.pip_cache_volume("linux/amd64") == "nemotron-airgap-pip-cache-linux-amd64" + + +def test_airgap_runner_progress_state_resumes_and_completes(tmp_path): + runner = _runner_module() + cfg = {"workflow": {"stages": ["byob:tiny"]}} + config_path = tmp_path / "airgap.yaml" + stages = ["validate"] + + state = runner.load_or_start_run_state( + tmp_path, + config_path=config_path, + cfg=cfg, + stages=stages, + execute=True, + ) + assert state is not None + runner.begin_action(state, "validate") + assert state.path.exists() + assert not state.done_path.exists() + + runner.complete_action(state, "validate", {"targets": ["byob:tiny"]}) + resumed = runner.load_or_start_run_state( + tmp_path, + config_path=config_path, + cfg=cfg, + stages=stages, + execute=True, + ) + assert runner.action_completed(resumed, "validate") + + manifest = tmp_path / "airgap-manifest.yaml" + manifest.write_text("schema_version: 1\n", encoding="utf-8") + runner.complete_run_state(resumed, manifest_path=manifest) + + assert not state.path.exists() + assert state.done_path.exists() + + +def test_airgap_runner_static_import_scan_stays_direct(): + runner = _runner_module() + step_py = runner.STEP_ROOT / "prep/sft_packing/step.py" + + imports = runner.discover_external_imports(step_py) + + assert "omegaconf" in imports + assert "cosmos_xenna" not in imports + + +def test_sft_airgap_overlay_clears_auto_mounts_but_inherits_config(): + runner = _runner_module() + config = 
load_config(runner.AIRGAP_DIR / "configs/sft_megatron_bridge_tiny.yaml") + plain = OmegaConf.to_container(config, resolve=False) + + assert plain["run"]["env"]["mounts"] == [] + assert plain["hf_model_path"] == "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" + assert plain["dataset"]["packed_sequence_specs"]["packed_sequence_size"] == 4096 From f4b8f50910c616d8fe48842ef2cf2ea3fc4bed1b Mon Sep 17 00:00:00 2001 From: Rakesh Paul Date: Fri, 8 May 2026 23:33:56 +0530 Subject: [PATCH 2/5] Refine Nemotron Customizer airgap image flow - Rename airgap artifacts to use launcher and execution image terminology - Update runner stages, manifests, README, and config keys to match the new naming - Keep execution image generation scoped to selected Nemotron Customizer steps - Preserve external handling for models, datasets, checkpoints, and customer storage paths - Refresh SFT Megatron Bridge airgap overlay configs - Update tests for launcher/execution image behavior and staged runner flow Signed-off-by: Rakesh Paul --- .../{Dockerfile.task => Dockerfile.execution} | 12 +- ...nore => Dockerfile.execution.dockerignore} | 4 +- ...ckerfile.submitter => Dockerfile.launcher} | 2 +- ...gnore => Dockerfile.launcher.dockerignore} | 0 deploy/nemotron-customizer/airgap/README.md | 40 +-- deploy/nemotron-customizer/airgap/airgap.yaml | 42 ++-- .../configs/sft_megatron_bridge_default.yaml | 2 +- .../configs/sft_megatron_bridge_tiny.yaml | 2 +- deploy/nemotron-customizer/airgap/runner.py | 237 ++++++++++-------- tests/deploy/test_airgap_runner.py | 42 ++-- 10 files changed, 200 insertions(+), 183 deletions(-) rename deploy/nemotron-customizer/airgap/{Dockerfile.task => Dockerfile.execution} (81%) rename deploy/nemotron-customizer/airgap/{Dockerfile.task.dockerignore => Dockerfile.execution.dockerignore} (68%) rename deploy/nemotron-customizer/airgap/{Dockerfile.submitter => Dockerfile.launcher} (91%) rename deploy/nemotron-customizer/airgap/{Dockerfile.submitter.dockerignore => 
Dockerfile.launcher.dockerignore} (100%) diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.task b/deploy/nemotron-customizer/airgap/Dockerfile.execution similarity index 81% rename from deploy/nemotron-customizer/airgap/Dockerfile.task rename to deploy/nemotron-customizer/airgap/Dockerfile.execution index bb68c321a..acc9fb7bd 100644 --- a/deploy/nemotron-customizer/airgap/Dockerfile.task +++ b/deploy/nemotron-customizer/airgap/Dockerfile.execution @@ -1,11 +1,11 @@ -# Derivative task image for Nemotron Customizer airgap. +# Derivative execution image for Nemotron Customizer airgap. # Built from the real training/runtime image and only adds small missing # wrapper packages. ARG BASE_IMAGE FROM ${BASE_IMAGE} -ARG TASK_REQUIREMENTS +ARG EXECUTION_REQUIREMENTS ARG REPO_OVERLAYS ARG REPO_OVERLAYS_DIR ARG PYTHON_BIN=python @@ -16,16 +16,16 @@ ENV TRANSFORMERS_OFFLINE=1 ENV HF_DATASETS_OFFLINE=1 ENV WANDB_MODE=offline -COPY ${TASK_REQUIREMENTS} /opt/nemotron-airgap/task-requirements.txt +COPY ${EXECUTION_REQUIREMENTS} /opt/nemotron-airgap/execution-requirements.txt COPY ${REPO_OVERLAYS} /opt/nemotron-airgap/repo-overlays.json COPY ${REPO_OVERLAYS_DIR}/ /opt/nemotron-airgap/repo-overlays/ # Build-time installs keep --no-cache-dir so derivative image layers stay small. 
-RUN if [ -s /opt/nemotron-airgap/task-requirements.txt ]; then \ +RUN if [ -s /opt/nemotron-airgap/execution-requirements.txt ]; then \ if [ "${PIP_NO_DEPS}" = "true" ]; then \ - ${PYTHON_BIN} -m pip install --no-cache-dir --no-deps -r /opt/nemotron-airgap/task-requirements.txt; \ + ${PYTHON_BIN} -m pip install --no-cache-dir --no-deps -r /opt/nemotron-airgap/execution-requirements.txt; \ else \ - ${PYTHON_BIN} -m pip install --no-cache-dir -r /opt/nemotron-airgap/task-requirements.txt; \ + ${PYTHON_BIN} -m pip install --no-cache-dir -r /opt/nemotron-airgap/execution-requirements.txt; \ fi; \ fi && \ ${PYTHON_BIN} - <<'PY' diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.task.dockerignore b/deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore similarity index 68% rename from deploy/nemotron-customizer/airgap/Dockerfile.task.dockerignore rename to deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore index ef7a5c102..9ec7d6457 100644 --- a/deploy/nemotron-customizer/airgap/Dockerfile.task.dockerignore +++ b/deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore @@ -4,8 +4,8 @@ !deploy/nemotron-customizer !deploy/nemotron-customizer/airgap !deploy/nemotron-customizer/airgap/out -!deploy/nemotron-customizer/airgap/out/task-context -!deploy/nemotron-customizer/airgap/out/task-context/** +!deploy/nemotron-customizer/airgap/out/execution-context +!deploy/nemotron-customizer/airgap/out/execution-context/** !deploy/nemotron-customizer/airgap/out/repo-overlays !deploy/nemotron-customizer/airgap/out/repo-overlays/** diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.submitter b/deploy/nemotron-customizer/airgap/Dockerfile.launcher similarity index 91% rename from deploy/nemotron-customizer/airgap/Dockerfile.submitter rename to deploy/nemotron-customizer/airgap/Dockerfile.launcher index ca2d0e131..b65701785 100644 --- a/deploy/nemotron-customizer/airgap/Dockerfile.submitter +++ 
b/deploy/nemotron-customizer/airgap/Dockerfile.launcher @@ -1,4 +1,4 @@ -# Submitter image for Nemotron Customizer airgap. +# Launcher image for Nemotron Customizer airgap. # It contains the repo and a uv-synced environment. It does not run training. ARG BASE_IMAGE=python:3.12-slim diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.submitter.dockerignore b/deploy/nemotron-customizer/airgap/Dockerfile.launcher.dockerignore similarity index 100% rename from deploy/nemotron-customizer/airgap/Dockerfile.submitter.dockerignore rename to deploy/nemotron-customizer/airgap/Dockerfile.launcher.dockerignore diff --git a/deploy/nemotron-customizer/airgap/README.md b/deploy/nemotron-customizer/airgap/README.md index ebbccbe9f..5f64e0e8f 100644 --- a/deploy/nemotron-customizer/airgap/README.md +++ b/deploy/nemotron-customizer/airgap/README.md @@ -5,8 +5,8 @@ This folder is scoped only to Nemotron Customizer steps under The flow is intentionally small: -1. Build one **submitter image** with this repo and `uv.lock`. -2. Build one or more **task images** by grouping selected workflow stages by base image. +1. Build one **launcher image** with this repo and `uv.lock`. +2. Build one or more **execution images** by grouping selected workflow stages by base image. 3. Save those images as tarballs for the airgapped side. 4. Keep models, datasets, checkpoints, and customer files on persistent storage. @@ -14,13 +14,13 @@ Edit `airgap.yaml` first: - `workflow.stages`: the Nemotron Customizer steps the customer wants to run - `dependencies`: central step dependency map, for example SFT training needs SFT packing -- `step_images`: which task image each step should use -- `task_images`: the base image, output tag, and known/import-probed Python requirements +- `step_execution_images`: which execution image each step should use +- `execution_images`: the base image, output tag, and known/import-probed Python requirements Only steps reached from `workflow.stages` are built. 
Steps are grouped by `base_image + repo_overlays`; each group gets one derivative image with the union of its small missing packages. If two selected step families share the -same base image and repo overlays, the runner emits one combined task image for +same base image and repo overlays, the runner emits one combined execution image for both. Run from the repo root: @@ -45,7 +45,7 @@ To run only a few stages: uv run python deploy/nemotron-customizer/airgap/runner.py \ --config deploy/nemotron-customizer/airgap/airgap.yaml \ --stage validate \ - --stage discover-task-deps + --stage discover-execution-deps ``` To override the workflow without editing YAML, pass one or more selected @@ -63,18 +63,18 @@ uv run python deploy/nemotron-customizer/airgap/runner.py \ Outputs are written under `deploy/nemotron-customizer/airgap/out/` by default: - `airgap-manifest.yaml`: what was validated and built -- `airgap-progress.yaml`: incomplete execute run state used for resume -- `airgap-complete.yaml`: final execute run state after success -- `requirements-.txt`: small missing packages per task image -- `repo-overlays-.json`: git auto-mounts discovered from selected step configs -- `submitter-image.tar` -- `task-*.tar` +- `airgap-build-state.yaml`: incomplete execute run state used for resume +- `airgap-build-complete.yaml`: final execute run state after success +- `requirements-.txt`: small missing packages per execution image +- `repo-overlays-.json`: git auto-mounts discovered from selected step configs +- `launcher-image.tar` +- `execution-*.tar` - SHA256 checksums for saved image tarballs in `airgap-manifest.yaml` -If an execute run fails midway, leave `airgap-progress.yaml` in place and rerun +If an execute run fails midway, leave `airgap-build-state.yaml` in place and rerun the same command. Completed expensive actions are reused when their artifacts still exist. 
If you intentionally change the workflow or image plan before -finishing, move or remove `airgap-progress.yaml` first; the runner will not +finishing, move or remove `airgap-build-state.yaml` first; the runner will not silently overwrite incomplete state from a different plan. Runtime dependency probes use Docker volumes named @@ -88,19 +88,19 @@ executor-visible persistent storage and reference them through config overrides and `run.env.mounts`. During dependency discovery, the runner mounts the connected-machine checkout -into each task image only to probe imports. The final task image deliberately -does not bake this repo; the submitter image and the normal nemo-run/nemo-runspec +into each execution image only to probe imports. The final execution image deliberately +does not bake this repo; the launcher image and the normal nemo-run/nemo-runspec code transport provide the repo to the remote job at submission time. Repo logistics stay outside `airgap.yaml`. If a selected step config contains `${auto_mount:git+...}`, the runner treats it as a connected-machine build input: -it fetches that pinned repo and bakes it into the derivative task image at the +it fetches that pinned repo and bakes it into the derivative execution image at the requested target path. Runtime jobs then use the baked image and do not clone from GitHub. Site-specific data/model mounts remain in env profiles or step overrides. If the connected machine is not the same architecture as the target cluster, -set `platform: linux/amd64` on the submitter or task image entry in +set `platform: linux/amd64` on the `launcher_image` or execution image entry in `airgap.yaml`. 
If you need to minimize transfer size for several images that share layers, `docker save -o all-images.tar tag1 tag2 ...` can be used after the runner builds the images; a single tar deduplicates shared layers better @@ -124,8 +124,8 @@ workflow: When submitting inside the airgap, use the deploy overlay config so those git auto-mounts are cleared at runtime while persistent storage mounts from the env profile still apply. Use the image printed by the runner under -`selected step images`, or read it from `out/airgap-manifest.yaml` under -`step_images`. +`selected execution images`, or read it from `out/airgap-manifest.yaml` under +`step_execution_images`. ```bash uv run nemotron step run sft/megatron_bridge \ diff --git a/deploy/nemotron-customizer/airgap/airgap.yaml b/deploy/nemotron-customizer/airgap/airgap.yaml index 7745ea857..5d4a60849 100644 --- a/deploy/nemotron-customizer/airgap/airgap.yaml +++ b/deploy/nemotron-customizer/airgap/airgap.yaml @@ -2,7 +2,7 @@ # # Change workflow.stages to the steps the customer wants. The runner expands # dependencies, validates those step files/configs, groups selected steps by -# task image, then builds only the images needed for that selection. +# execution image, then builds only the images needed for that selection. workflow: name: sft-megatron-bridge @@ -18,18 +18,18 @@ workflow: build_stages: - validate - - discover-task-deps - - build-submitter - - build-task-images + - discover-execution-deps + - build-launcher-image + - build-execution-images - save-images paths: output_dir: deploy/nemotron-customizer/airgap/out -submitter: +launcher_image: base_image: python:3.12-slim - tag: nemotron-customizer-submit-airgap:latest - tar: submitter-image.tar + tag: nemotron-customizer-launcher-airgap:latest + tar: launcher-image.tar # Central dependency map. Keep this small and explicit: it is only for steps # that naturally require a previous Nemotron Customizer step output. 
@@ -51,15 +51,15 @@ dependencies: # SDG can feed SFT or RL prep, but it is not forced as a dependency because # many customers bring their own JSONL on persistent storage. -# Step -> task-image mapping. The runner only uses entries reached from +# Step -> execution-image mapping. The runner only uses entries reached from # workflow.stages after dependency expansion. -step_images: +step_execution_images: byob: nemo-data-designer convert/hf_to_megatron: nemo-megatron convert/megatron_to_hf: nemo-megatron convert/merge_lora: nemo-megatron curate/nemo_curator: nemo-curator - env/env_toml: submitter-python + env/env_toml: launcher-python eval/model_eval: nemo-eval optimize/modelopt/distill: nemo-modelopt optimize/modelopt/prune: nemo-modelopt @@ -80,51 +80,51 @@ step_images: translate/nemo_skills: nemo-curator translate/translation: nemo-curator -task_images: - submitter-python: +execution_images: + launcher-python: base_image: python:3.12-slim - tag: nemotron-customizer-python-task-airgap:latest - tar: task-python-image.tar + tag: nemotron-customizer-python-execution-airgap:latest + tar: execution-python-image.tar nemo-megatron: base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano tag: nemotron-customizer-nemo-megatron-airgap:latest - tar: task-nemo-megatron-image.tar + tar: execution-nemo-megatron-image.tar required_imports: [] nemo-automodel: base_image: nvcr.io/nvidia/nemo-automodel:26.04 tag: nemotron-customizer-nemo-automodel-airgap:latest - tar: task-nemo-automodel-image.tar + tar: execution-nemo-automodel-image.tar required_imports: [] nemo-rl: base_image: nvcr.io/nvidia/nemo-rl:v0.6.0 tag: nemotron-customizer-nemo-rl-airgap:latest - tar: task-nemo-rl-image.tar + tar: execution-nemo-rl-image.tar required_imports: [] nemo-modelopt: base_image: nvcr.io/nvidia/nemo:26.02 tag: nemotron-customizer-nemo-modelopt-airgap:latest - tar: task-nemo-modelopt-image.tar + tar: execution-nemo-modelopt-image.tar required_imports: [] nemo-curator: base_image: 
nvcr.io/nvidia/nemo-curator:25.07 tag: nemotron-customizer-nemo-curator-airgap:latest - tar: task-nemo-curator-image.tar + tar: execution-nemo-curator-image.tar required_imports: [] nemo-data-designer: base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano tag: nemotron-customizer-nemo-data-designer-airgap:latest - tar: task-nemo-data-designer-image.tar + tar: execution-nemo-data-designer-image.tar required_imports: - data_designer nemo-eval: base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano tag: nemotron-customizer-nemo-eval-airgap:latest - tar: task-nemo-eval-image.tar + tar: execution-nemo-eval-image.tar required_imports: [] diff --git a/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml index d1854c2bf..a2e4b828c 100644 --- a/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml +++ b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml @@ -1,7 +1,7 @@ # Airgap runtime overlay for sft/megatron_bridge:default. # # The connected-machine airgap runner bakes the auto_mount repos from the base -# config into the derivative task image. At runtime, clear those git auto-mounts +# config into the derivative execution image. At runtime, clear those git auto-mounts # so the airgapped job does not clone from GitHub. Env-profile persistent # storage mounts still append normally. diff --git a/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml index b2f54d38b..eb71f5f96 100644 --- a/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml +++ b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml @@ -1,7 +1,7 @@ # Airgap runtime overlay for sft/megatron_bridge:tiny. # # The connected-machine airgap runner bakes the auto_mount repos from the base -# config into the derivative task image. 
At runtime, clear those git auto-mounts +# config into the derivative execution image. At runtime, clear those git auto-mounts # so the airgapped job does not clone from GitHub. Env-profile persistent # storage mounts still append normally. diff --git a/deploy/nemotron-customizer/airgap/runner.py b/deploy/nemotron-customizer/airgap/runner.py index 433a43f93..13d45a8ec 100644 --- a/deploy/nemotron-customizer/airgap/runner.py +++ b/deploy/nemotron-customizer/airgap/runner.py @@ -3,7 +3,7 @@ This file intentionally lives under deploy/nemotron-customizer/airgap instead of adding a new step. It is a connected-machine helper that validates requested -steps, discovers small task-image Python gaps, builds submitter/task images, and +steps, discovers small execution-image Python gaps, builds launcher/execution images, and saves image tarballs. """ @@ -33,8 +33,8 @@ STEP_ROOT = SRC_ROOT / "nemotron" / "steps" DEFAULT_OUTPUT_DIR = AIRGAP_DIR / "out" UV_VERSION = "0.11.1" -PROGRESS_STATE = "airgap-progress.yaml" -COMPLETE_STATE = "airgap-complete.yaml" +PROGRESS_STATE = "airgap-build-state.yaml" +COMPLETE_STATE = "airgap-build-complete.yaml" LOCAL_PREFIXES = ("nemotron", "nemo_runspec") CORE_IMPORTS = { "datasets", @@ -88,7 +88,7 @@ class RepoOverlay: @dataclass -class TaskGroup: +class ExecutionGroup: name: str base_image: str tag: str @@ -133,7 +133,7 @@ def main(argv: list[str] | None = None) -> int: cfg = with_workflow_targets(cfg, normalize_target_specs(args.target)) stages = normalize_stages(args.stage or cfg.get("build_stages") or cfg.get("stages") or []) output_dir = resolve_repo_path(Path(cfg.get("paths", {}).get("output_dir", DEFAULT_OUTPUT_DIR))) - if "build-task-images" in stages: + if "build-execution-images" in stages: validate_docker_context_path(output_dir, field="paths.output_dir") output_dir.mkdir(parents=True, exist_ok=True) run_state = load_or_start_run_state( @@ -152,7 +152,7 @@ def main(argv: list[str] | None = None) -> int: expanded_targets: list[Target] 
= [] step_infos: dict[str, StepInfo] = {} - groups: list[TaskGroup] = [] + groups: list[ExecutionGroup] = [] workflow_manifest: dict[str, Any] = { "stages": list(workflow.get("stages") or []), } @@ -173,67 +173,78 @@ def main(argv: list[str] | None = None) -> int: print(f"[validate] {len(step_infos)} target(s) ok") complete_action(run_state, "validate", {"targets": [target.spec for target in expanded_targets]}) - if any(stage in stages for stage in ("discover-task-deps", "build-task-images", "save-images")): - groups = task_groups(cfg, output_dir=output_dir, step_infos=step_infos) - manifest["task_groups"] = [task_group_manifest(group) for group in groups] + if any(stage in stages for stage in ("discover-execution-deps", "build-execution-images", "save-images")): + groups = execution_groups(cfg, output_dir=output_dir, step_infos=step_infos) + manifest["execution_groups"] = [execution_group_manifest(group) for group in groups] - if "discover-task-deps" in stages: - if action_completed(run_state, "discover-task-deps") and hydrate_discovered_groups(run_state, groups): - print("[resume] skipping discover-task-deps; using saved probe results") + if "discover-execution-deps" in stages: + if action_completed(run_state, "discover-execution-deps") and hydrate_discovered_groups(run_state, groups): + print("[resume] skipping discover-execution-deps; using saved probe results") else: - begin_action(run_state, "discover-task-deps") + begin_action(run_state, "discover-execution-deps") locked_versions = locked_package_versions(REPO_ROOT / "uv.lock") for group in groups: - discover_task_deps(group, step_infos=step_infos, locked_versions=locked_versions, execute=args.execute) + discover_execution_deps( + group, + step_infos=step_infos, + locked_versions=locked_versions, + execute=args.execute, + ) remember_discovered_groups(run_state, groups) - complete_action(run_state, "discover-task-deps", {"groups": [group.name for group in groups]}) - manifest["task_groups"] = 
[task_group_manifest(group) for group in groups] - - if "build-submitter" in stages: - submitter = cfg.get("submitter", {}) - submitter_tag = str(submitter.get("tag") or "nemotron-customizer-submit-airgap:latest") - platform = submitter_platform(submitter) - action = "build-submitter" - if action_completed(run_state, action) and docker_image_exists(submitter_tag, platform=platform): - print(f"[resume] skipping {action}; image exists: {submitter_tag}") + complete_action(run_state, "discover-execution-deps", {"groups": [group.name for group in groups]}) + manifest["execution_groups"] = [execution_group_manifest(group) for group in groups] + + if "build-launcher-image" in stages: + launcher_image = cfg.get("launcher_image", {}) + launcher_image_tag = str(launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest") + platform = launcher_image_platform(launcher_image) + action = "build-launcher-image" + if action_completed(run_state, action) and docker_image_exists(launcher_image_tag, platform=platform): + print(f"[resume] skipping {action}; image exists: {launcher_image_tag}") else: begin_action(run_state, action) - status = build_submitter(submitter, execute=args.execute) + status = build_launcher_image(launcher_image, execute=args.execute) if status: return status - complete_action(run_state, action, {"image": submitter_tag}) - manifest["submitter"] = submitter_manifest(submitter) + complete_action(run_state, action, {"image": launcher_image_tag}) + manifest["launcher_image"] = launcher_image_manifest(launcher_image) - if "build-task-images" in stages: + if "build-execution-images" in stages: clean_stale_group_dirs(output_dir, groups, execute=args.execute) for group in groups: - action = f"build-task-image:{group.name}" + action = f"build-execution-image:{group.name}" if action_completed(run_state, action) and docker_image_exists(group.tag, platform=group.platform): print(f"[resume] skipping {action}; image exists: {group.tag}") else: 
begin_action(run_state, action) - status = build_task_image(group, output_dir=output_dir, execute=args.execute) + status = build_execution_image(group, output_dir=output_dir, execute=args.execute) if status: return status complete_action(run_state, action, {"image": group.tag}) - manifest["task_groups"] = [task_group_manifest(group) for group in groups] + manifest["execution_groups"] = [execution_group_manifest(group) for group in groups] if "save-images" in stages: - submitter = cfg.get("submitter", {}) - if submitter: - output = output_dir / str(submitter.get("tar", "submitter-image.tar")) - submitter_tag = str(submitter.get("tag") or "nemotron-customizer-submit-airgap:latest") - action = f"save-image:{submitter_tag}" + launcher_image = cfg.get("launcher_image", {}) + if launcher_image: + output = output_dir / str(launcher_image.get("tar", "launcher-image.tar")) + launcher_image_tag = str(launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest") + action = f"save-image:{launcher_image_tag}" if action_completed(run_state, action) and output.exists(): print(f"[resume] skipping {action}; tar exists: {output}") else: begin_action(run_state, action) - status = save_image(submitter_tag, output, args.execute) + status = save_image(launcher_image_tag, output, args.execute) if status: return status complete_action(run_state, action, {"tar": str(output)}) saved_images.append( - saved_image_manifest(submitter_tag, output, execute=args.execute, role="submitter", name="submitter") + saved_image_manifest( + launcher_image_tag, + output, + execute=args.execute, + role="launcher", + name="launcher", + ) ) for group in groups: action = f"save-image:{group.tag}" @@ -246,7 +257,7 @@ def main(argv: list[str] | None = None) -> int: return status complete_action(run_state, action, {"tar": str(group.tar)}) saved_images.append( - saved_image_manifest(group.tag, group.tar, execute=args.execute, role="task", name=group.name) + saved_image_manifest(group.tag, group.tar, 
execute=args.execute, role="execution", name=group.name) ) manifest["persistent_assets"] = { @@ -254,14 +265,14 @@ def main(argv: list[str] | None = None) -> int: "mounts_from_configs": collect_mounts(step_infos.values()), "baked_repo_overlays": [repo_overlay_manifest(item) for item in collect_repo_overlays(step_infos.values())], } - manifest["step_images"] = step_image_manifest(groups) + manifest["step_execution_images"] = step_execution_image_manifest(groups) manifest["saved_images"] = saved_images manifest_path = output_dir / "airgap-manifest.yaml" manifest_path.write_text(yaml.safe_dump(manifest, sort_keys=False), encoding="utf-8") complete_run_state(run_state, manifest_path=manifest_path) print(f"[airgap] wrote {manifest_path}") if groups: - print("[airgap] selected step images:") + print("[airgap] selected execution images:") for group in groups: image = group.selected_image or group.tag for step_id in group.steps: @@ -372,9 +383,9 @@ def run_signature(*, config_path: Path, cfg: Mapping[str, Any], stages: list[str "stages": stages, "workflow": cfg.get("workflow"), "dependencies": cfg.get("dependencies"), - "step_images": cfg.get("step_images"), - "task_images": cfg.get("task_images"), - "submitter": cfg.get("submitter"), + "step_execution_images": cfg.get("step_execution_images"), + "execution_images": cfg.get("execution_images"), + "launcher_image": cfg.get("launcher_image"), } text = yaml.safe_dump(payload, sort_keys=True) return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16] @@ -414,7 +425,7 @@ def action_completed(state: RunState | None, action: str) -> bool: return action in (state.data.get("completed_actions") or {}) -def remember_discovered_groups(state: RunState | None, groups: Iterable[TaskGroup]) -> None: +def remember_discovered_groups(state: RunState | None, groups: Iterable[ExecutionGroup]) -> None: if state is None: return state.data["discovered_groups"] = { @@ -429,7 +440,7 @@ def remember_discovered_groups(state: RunState | None, 
groups: Iterable[TaskGrou write_run_state(state) -def hydrate_discovered_groups(state: RunState | None, groups: Iterable[TaskGroup]) -> bool: +def hydrate_discovered_groups(state: RunState | None, groups: Iterable[ExecutionGroup]) -> bool: if state is None: return False saved = state.data.get("discovered_groups") or {} @@ -463,7 +474,13 @@ def normalize_stages(stages: Iterable[str]) -> list[str]: stage = item.strip() if stage and stage not in out: out.append(stage) - out = out or ["validate", "discover-task-deps", "build-submitter", "build-task-images", "save-images"] + out = out or [ + "validate", + "discover-execution-deps", + "build-launcher-image", + "build-execution-images", + "save-images", + ] def ensure_before(required: str, requested: str) -> None: if requested not in out or required in out: @@ -474,17 +491,17 @@ def ensure_before(required: str, requested: str) -> None: # Apply prerequisite edges from later stages toward earlier stages. Each # insertion is idempotent, so a user can ask for any suffix of the pipeline. 
- ensure_before("build-task-images", "save-images") - ensure_before("build-submitter", "save-images") - ensure_before("discover-task-deps", "build-task-images") - ensure_before("validate", "discover-task-deps") - ensure_before("validate", "build-task-images") + ensure_before("build-execution-images", "save-images") + ensure_before("build-launcher-image", "save-images") + ensure_before("discover-execution-deps", "build-execution-images") + ensure_before("validate", "discover-execution-deps") + ensure_before("validate", "build-execution-images") ensure_before("validate", "save-images") order = { "validate": 0, - "discover-task-deps": 1, - "build-submitter": 2, - "build-task-images": 3, + "discover-execution-deps": 1, + "build-launcher-image": 2, + "build-execution-images": 3, "save-images": 4, } out.sort(key=lambda stage: order.get(stage, len(order))) @@ -492,7 +509,7 @@ def ensure_before(required: str, requested: str) -> None: def stage_needs_targets(stage: str) -> bool: - return stage in {"discover-task-deps", "build-task-images", "save-images"} + return stage in {"discover-execution-deps", "build-execution-images", "save-images"} def expand_targets(cfg: Mapping[str, Any]) -> list[Target]: @@ -577,41 +594,41 @@ def read_config_mounts(config_path: Path | None) -> list[Any]: return mounts if isinstance(mounts, list) else [] -def task_groups( +def execution_groups( cfg: Mapping[str, Any], *, output_dir: Path, step_infos: Mapping[str, StepInfo] | None = None, -) -> list[TaskGroup]: +) -> list[ExecutionGroup]: if not step_infos: - raise SystemExit("validate must run before task images can be planned") - if not cfg.get("step_images"): - raise SystemExit("airgap.yaml must define step_images for the selected workflow stages") - return task_groups_from_step_images(cfg, output_dir=output_dir, step_infos=step_infos) + raise SystemExit("validate must run before execution images can be planned") + if not cfg.get("step_execution_images"): + raise SystemExit("airgap.yaml must 
define step_execution_images for the selected workflow stages") + return execution_groups_from_step_execution_images(cfg, output_dir=output_dir, step_infos=step_infos) -def task_groups_from_step_images( +def execution_groups_from_step_execution_images( cfg: Mapping[str, Any], *, output_dir: Path, step_infos: Mapping[str, StepInfo], -) -> list[TaskGroup]: - step_images = normalize_step_images(cfg.get("step_images") or {}) - image_defs = normalize_task_images(cfg.get("task_images") or {}) - merged: dict[str, TaskGroup] = {} +) -> list[ExecutionGroup]: + step_execution_images = normalize_step_execution_images(cfg.get("step_execution_images") or {}) + image_defs = normalize_execution_images(cfg.get("execution_images") or {}) + merged: dict[str, ExecutionGroup] = {} for step_id in step_infos: - image_name = step_images.get(step_id) + image_name = step_execution_images.get(step_id) if not image_name: - raise SystemExit(f"{step_id}: missing step_images entry in airgap.yaml") + raise SystemExit(f"{step_id}: missing step_execution_images entry in airgap.yaml") image_def = image_defs.get(image_name) if image_def is None: - raise SystemExit(f"{step_id}: step_images points to unknown task image {image_name!r}") + raise SystemExit(f"{step_id}: step_execution_images points to unknown execution image {image_name!r}") base = str(image_def.get("base_image") or "").strip() if not base: - raise SystemExit(f"task_images.{image_name}.base_image is required") + raise SystemExit(f"execution_images.{image_name}.base_image is required") repo_overlays = getattr(step_infos[step_id], "repo_overlays", []) - group_key = task_group_key(base, repo_overlays) + group_key = execution_group_key(base, repo_overlays) group = merged.get(group_key) if group is None: suffix = short_hash( @@ -620,11 +637,11 @@ def task_groups_from_step_images( "repo_overlays": [repo_overlay_manifest(item) for item in repo_overlays], } ) - group = TaskGroup( + group = ExecutionGroup( name=f"{image_name}-{suffix}", 
base_image=base, tag="", - tar=output_dir / "task-image.tar", + tar=output_dir / "execution-image.tar", steps=[], platform=str(image_def["platform"]) if image_def.get("platform") else None, pip_no_deps=bool(image_def.get("pip_no_deps", True)), @@ -639,12 +656,12 @@ def task_groups_from_step_images( repo_overlays, ) for group in merged.values(): - finalize_task_group_name(group, image_defs=image_defs, output_dir=output_dir) + finalize_execution_group_name(group, image_defs=image_defs, output_dir=output_dir) return list(merged.values()) -def finalize_task_group_name( - group: TaskGroup, +def finalize_execution_group_name( + group: ExecutionGroup, *, image_defs: Mapping[str, Mapping[str, Any]], output_dir: Path, @@ -659,20 +676,20 @@ def finalize_task_group_name( if len(names) == 1: image_name = names[0] image_def = image_defs[image_name] - tag = str(image_def.get("tag") or f"nemotron-task-{sanitize(image_name)}:airgap") - tar = output_dir / str(image_def.get("tar") or f"task-{sanitize(image_name)}.tar") + tag = str(image_def.get("tag") or f"nemotron-execution-{sanitize(image_name)}:airgap") + tar = output_dir / str(image_def.get("tar") or f"execution-{sanitize(image_name)}.tar") group.name = f"{image_name}-{suffix}" else: merged_name = "-".join(sanitize(name) for name in names) tag = f"nemotron-customizer-{merged_name}-airgap:latest" - tar = output_dir / f"task-{merged_name}-image.tar" + tar = output_dir / f"execution-{merged_name}-image.tar" group.name = f"{merged_name}-{suffix}" group.tag = tag_with_suffix(tag, suffix) group.tar = tar_with_suffix(tar, suffix) group.selected_image = group.tag -def task_group_key(base_image: str, repo_overlays: Iterable[RepoOverlay]) -> str: +def execution_group_key(base_image: str, repo_overlays: Iterable[RepoOverlay]) -> str: overlays = sorted( (repo_overlay_manifest(item) for item in repo_overlays), key=lambda item: (item["target"], item["url"], item["ref"], item["repo"]), @@ -704,17 +721,17 @@ def tar_with_suffix(path: Path, 
suffix: str) -> Path: return path.with_name(f"{path.stem}-{suffix}{path.suffix}") -def normalize_step_images(raw: Mapping[str, Any]) -> dict[str, str]: +def normalize_step_execution_images(raw: Mapping[str, Any]) -> dict[str, str]: out: dict[str, str] = {} for step_id, value in raw.items(): if isinstance(value, str): out[str(step_id)] = value - elif isinstance(value, Mapping) and value.get("task_image"): - out[str(step_id)] = str(value["task_image"]) + elif isinstance(value, Mapping) and value.get("execution_image"): + out[str(step_id)] = str(value["execution_image"]) return out -def normalize_task_images(raw: Any) -> dict[str, Mapping[str, Any]]: +def normalize_execution_images(raw: Any) -> dict[str, Mapping[str, Any]]: if isinstance(raw, Mapping): return {str(name): spec for name, spec in raw.items() if isinstance(spec, Mapping)} return {} @@ -755,8 +772,8 @@ def merge_repo_overlays(existing: list[RepoOverlay], incoming: Iterable[RepoOver return out -def discover_task_deps( - group: TaskGroup, +def discover_execution_deps( + group: ExecutionGroup, *, step_infos: Mapping[str, StepInfo], locked_versions: Mapping[str, str], @@ -845,7 +862,7 @@ def probe_step_modules( pip_no_deps: bool, platform: str | None = None, ) -> list[str]: - """Import selected step modules in the task image and discover missing imports. + """Import selected step modules in the execution image and discover missing imports. 
The loop installs only the packages it has already identified, in an ephemeral container, so the final requirements file stays based on actual @@ -899,7 +916,7 @@ def probe_step_modules( if import_name not in missing: missing.append(import_name) if import_name in CORE_IMPORTS: - print(f"[probe] base image is missing core import {import_name!r}; choose a compatible task image") + print(f"[probe] base image is missing core import {import_name!r}; choose a compatible execution image") return missing requirement = requirement_for_import(import_name, locked_versions) if requirement in requirements: @@ -940,15 +957,15 @@ def normalize_package(name: str) -> str: return re.sub(r"[-_.]+", "-", name).lower() -def build_submitter(submitter: Mapping[str, Any], *, execute: bool) -> int: - image = str(submitter.get("tag") or "nemotron-customizer-submit-airgap:latest") - base = str(submitter.get("base_image") or "python:3.12-slim") - platform = submitter_platform(submitter) +def build_launcher_image(launcher_image: Mapping[str, Any], *, execute: bool) -> int: + image = str(launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest") + base = str(launcher_image.get("base_image") or "python:3.12-slim") + platform = launcher_image_platform(launcher_image) cmd = [ "docker", "build", "-f", - str(AIRGAP_DIR / "Dockerfile.submitter"), + str(AIRGAP_DIR / "Dockerfile.launcher"), "--build-arg", f"BASE_IMAGE={base}", "--build-arg", @@ -964,12 +981,12 @@ def build_submitter(submitter: Mapping[str, Any], *, execute: bool) -> int: return run_or_print(cmd, execute) -def submitter_platform(submitter: Mapping[str, Any]) -> str | None: - return str(submitter["platform"]) if submitter.get("platform") else None +def launcher_image_platform(launcher_image: Mapping[str, Any]) -> str | None: + return str(launcher_image["platform"]) if launcher_image.get("platform") else None -def build_task_image(group: TaskGroup, *, output_dir: Path, execute: bool) -> int: - group_dir = output_dir / 
"task-context" / group.name +def build_execution_image(group: ExecutionGroup, *, output_dir: Path, execute: bool) -> int: + group_dir = output_dir / "execution-context" / group.name group_dir.mkdir(parents=True, exist_ok=True) group.requirements_path = group_dir / f"requirements-{group.name}.txt" group.requirements_path.write_text( @@ -987,11 +1004,11 @@ def build_task_image(group: TaskGroup, *, output_dir: Path, execute: bool) -> in "docker", "build", "-f", - str(AIRGAP_DIR / "Dockerfile.task"), + str(AIRGAP_DIR / "Dockerfile.execution"), "--build-arg", f"BASE_IMAGE={group.base_image}", "--build-arg", - f"TASK_REQUIREMENTS={docker_context_path(group.requirements_path)}", + f"EXECUTION_REQUIREMENTS={docker_context_path(group.requirements_path)}", "--build-arg", f"REPO_OVERLAYS={docker_context_path(group.repo_overlays_path)}", "--build-arg", @@ -1009,7 +1026,7 @@ def build_task_image(group: TaskGroup, *, output_dir: Path, execute: bool) -> in return run_or_print(cmd, execute) -def prepare_repo_overlays(group: TaskGroup, *, repos_root: Path, execute: bool) -> None: +def prepare_repo_overlays(group: ExecutionGroup, *, repos_root: Path, execute: bool) -> None: repos_root.mkdir(parents=True, exist_ok=True) (repos_root / ".keep").touch() for overlay in group.repo_overlays: @@ -1086,9 +1103,9 @@ def run_or_print(cmd: list[str], execute: bool, *, mkdir: Path | None = None) -> return subprocess.run(cmd, check=False, cwd=REPO_ROOT).returncode -def clean_stale_group_dirs(output_dir: Path, groups: Iterable[TaskGroup], *, execute: bool) -> None: +def clean_stale_group_dirs(output_dir: Path, groups: Iterable[ExecutionGroup], *, execute: bool) -> None: keep = {group.name for group in groups} - for relative in ("task-context", "repo-overlays"): + for relative in ("execution-context", "repo-overlays"): parent = output_dir / relative if not parent.exists(): continue @@ -1180,7 +1197,7 @@ def step_to_manifest(info: StepInfo) -> dict[str, Any]: } -def task_group_manifest(group: 
TaskGroup) -> dict[str, Any]: +def execution_group_manifest(group: ExecutionGroup) -> dict[str, Any]: return { "name": group.name, "image_names": sorted(group.image_names), @@ -1201,7 +1218,7 @@ def task_group_manifest(group: TaskGroup) -> dict[str, Any]: } -def step_image_manifest(groups: Iterable[TaskGroup]) -> dict[str, str]: +def step_execution_image_manifest(groups: Iterable[ExecutionGroup]) -> dict[str, str]: out: dict[str, str] = {} for group in groups: image = group.selected_image or group.tag @@ -1210,12 +1227,12 @@ def step_image_manifest(groups: Iterable[TaskGroup]) -> dict[str, str]: return out -def submitter_manifest(submitter: Mapping[str, Any]) -> dict[str, Any]: +def launcher_image_manifest(launcher_image: Mapping[str, Any]) -> dict[str, Any]: return { - "base_image": submitter.get("base_image") or "python:3.12-slim", - "platform": submitter.get("platform"), - "tag": submitter.get("tag") or "nemotron-customizer-submit-airgap:latest", - "tar": submitter.get("tar") or "submitter-image.tar", + "base_image": launcher_image.get("base_image") or "python:3.12-slim", + "platform": launcher_image.get("platform"), + "tag": launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest", + "tar": launcher_image.get("tar") or "launcher-image.tar", } diff --git a/tests/deploy/test_airgap_runner.py b/tests/deploy/test_airgap_runner.py index f93f47c26..f1fa51d9e 100644 --- a/tests/deploy/test_airgap_runner.py +++ b/tests/deploy/test_airgap_runner.py @@ -40,7 +40,7 @@ def test_airgap_runner_expands_and_validates_sft_dependency(): ] -def test_airgap_runner_groups_task_images_by_base_image_and_repo_overlays(tmp_path): +def test_airgap_runner_groups_execution_images_by_base_image_and_repo_overlays(tmp_path): runner = _runner_module() overlay = runner.RepoOverlay( repo="Megatron-Bridge", @@ -49,11 +49,11 @@ def test_airgap_runner_groups_task_images_by_base_image_and_repo_overlays(tmp_pa target="/opt/Megatron-Bridge", ) cfg = { - "step_images": { + 
"step_execution_images": { "prep/sft_packing": "a", "sft/megatron_bridge": "b", }, - "task_images": { + "execution_images": { "a": { "base_image": "image:base", "tag": "image:a", @@ -69,7 +69,7 @@ def test_airgap_runner_groups_task_images_by_base_image_and_repo_overlays(tmp_pa }, } - groups = runner.task_groups( + groups = runner.execution_groups( cfg, output_dir=tmp_path, step_infos={ @@ -107,17 +107,17 @@ def test_airgap_runner_groups_task_images_by_base_image_and_repo_overlays(tmp_pa def test_airgap_runner_only_builds_images_for_selected_steps(tmp_path): runner = _runner_module() cfg = { - "step_images": { + "step_execution_images": { "prep/sft_packing": "nemo-megatron", "sft/automodel": "nemo-automodel", }, - "task_images": { + "execution_images": { "nemo-megatron": {"base_image": "nemo:base"}, "nemo-automodel": {"base_image": "automodel:base"}, }, } - groups = runner.task_groups(cfg, output_dir=tmp_path, step_infos={"prep/sft_packing": object()}) + groups = runner.execution_groups(cfg, output_dir=tmp_path, step_infos={"prep/sft_packing": object()}) assert len(groups) == 1 assert groups[0].name.startswith("nemo-megatron-") @@ -131,7 +131,7 @@ def test_airgap_runner_maps_sdg_to_light_sdk_image(tmp_path): targets = runner.expand_targets(cfg) infos = runner.validate_targets(targets) - groups = runner.task_groups(cfg, output_dir=tmp_path, step_infos=infos) + groups = runner.execution_groups(cfg, output_dir=tmp_path, step_infos=infos) assert [target.spec for target in targets] == ["sdg/data_designer:tiny"] assert len(groups) == 1 @@ -147,7 +147,7 @@ def test_airgap_runner_maps_byob_to_data_designer_image(tmp_path): targets = runner.expand_targets(cfg) infos = runner.validate_targets(targets) - groups = runner.task_groups(cfg, output_dir=tmp_path, step_infos=infos) + groups = runner.execution_groups(cfg, output_dir=tmp_path, step_infos=infos) assert [target.spec for target in targets] == ["byob:tiny"] assert len(groups) == 1 @@ -165,7 +165,7 @@ def 
test_airgap_runner_target_override_selects_sdg_and_sft(): targets = runner.expand_targets(cfg) infos = runner.validate_targets(targets) - groups = runner.task_groups(cfg, output_dir=runner.AIRGAP_DIR / "out", step_infos=infos) + groups = runner.execution_groups(cfg, output_dir=runner.AIRGAP_DIR / "out", step_infos=infos) assert [target.spec for target in targets] == [ "sdg/data_designer:tiny", @@ -187,11 +187,11 @@ def test_airgap_runner_splits_same_base_image_when_repo_overlays_differ(tmp_path target="/opt/Megatron-Bridge", ) cfg = { - "step_images": { + "step_execution_images": { "prep/sft_packing": "nemo-megatron", "sft/megatron_bridge": "nemo-megatron", }, - "task_images": { + "execution_images": { "nemo-megatron": { "base_image": "nemo:base", "tag": "nemo-airgap:latest", @@ -199,7 +199,7 @@ def test_airgap_runner_splits_same_base_image_when_repo_overlays_differ(tmp_path }, }, } - groups = runner.task_groups( + groups = runner.execution_groups( cfg, output_dir=tmp_path, step_infos={ @@ -250,16 +250,16 @@ def test_airgap_runner_uses_collision_safe_repo_overlay_dirs(): def test_airgap_runner_auto_adds_stage_prerequisites(): runner = _runner_module() - assert runner.normalize_stages(["build-task-images"]) == [ + assert runner.normalize_stages(["build-execution-images"]) == [ "validate", - "discover-task-deps", - "build-task-images", + "discover-execution-deps", + "build-execution-images", ] assert runner.normalize_stages(["save-images"]) == [ "validate", - "discover-task-deps", - "build-submitter", - "build-task-images", + "discover-execution-deps", + "build-launcher-image", + "build-execution-images", "save-images", ] @@ -302,9 +302,9 @@ def test_airgap_runner_saved_image_manifest_has_checksum(tmp_path): image_tar = tmp_path / "image.tar" image_tar.write_text("image bytes", encoding="utf-8") - saved = runner.saved_image_manifest("image:tag", image_tar, execute=True, role="task", name="group") + saved = runner.saved_image_manifest("image:tag", image_tar, 
execute=True, role="execution", name="group") - assert saved["role"] == "task" + assert saved["role"] == "execution" assert saved["name"] == "group" assert saved["image"] == "image:tag" assert saved["tar"] == str(image_tar) From 4c0246099c822ef81f8803fa9d7e32d508ece358 Mon Sep 17 00:00:00 2001 From: Rakesh Paul Date: Sat, 9 May 2026 00:27:38 +0530 Subject: [PATCH 3/5] Fix launcher image setup and Docker platform inspection - Install git and CA certificates in the launcher image before uv sync - Capture only docker inspect stdout while suppressing stderr during platform checks - Keep the airgap runner platform probe compatible with subprocess stderr handling Signed-off-by: Rakesh Paul --- deploy/nemotron-customizer/airgap/Dockerfile.launcher | 4 ++++ deploy/nemotron-customizer/airgap/runner.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.launcher b/deploy/nemotron-customizer/airgap/Dockerfile.launcher index b65701785..7d26315d5 100644 --- a/deploy/nemotron-customizer/airgap/Dockerfile.launcher +++ b/deploy/nemotron-customizer/airgap/Dockerfile.launcher @@ -17,6 +17,10 @@ ENV WANDB_MODE=offline ENV PYTHONPATH=/workspace/Nemotron/src ENV PATH=/workspace/Nemotron/.venv/bin:$PATH +RUN apt-get update && \ + apt-get install -y --no-install-recommends git ca-certificates && \ + rm -rf /var/lib/apt/lists/* + RUN python -m pip install --no-cache-dir "uv==${UV_VERSION}" COPY . . 
diff --git a/deploy/nemotron-customizer/airgap/runner.py b/deploy/nemotron-customizer/airgap/runner.py index 13d45a8ec..c6cf33d4e 100644 --- a/deploy/nemotron-customizer/airgap/runner.py +++ b/deploy/nemotron-customizer/airgap/runner.py @@ -1071,7 +1071,7 @@ def docker_image_platform(image: str) -> str | None: "{{.Os}}/{{.Architecture}}{{if .Variant}}/{{.Variant}}{{end}}", image, ], - capture_output=True, + stdout=subprocess.PIPE, text=True, stderr=subprocess.DEVNULL, cwd=REPO_ROOT, From 2097108b30dc2312406242e8eb4ac9096d73996c Mon Sep 17 00:00:00 2001 From: Rakesh Paul Date: Mon, 11 May 2026 15:06:54 +0530 Subject: [PATCH 4/5] Normalize step CLI under nemotron steps - Move generic step commands and backends from commands/step to commands/steps - Register only `nemotron steps`; remove the singular `nemotron step` alias - Expose `steps list`, `steps show`, `steps run`, and `steps translation` - Update imports, tests, docs, skills, and config examples to the plural CLI - Add coverage for plural command registration and singular alias rejection Signed-off-by: Rakesh Paul --- deploy/nemotron-customizer/airgap/README.md | 2 +- skills/nemotron-customize/SKILL.md | 20 ++++----- .../context/automodel-pretrain.txt | 2 +- src/nemotron/cli/bin/nemotron.py | 1 - src/nemotron/cli/commands/step/__init__.py | 45 ------------------- .../cli/commands/{step => steps}/_resolve.py | 0 .../cli/commands/steps/_typer_group.py | 20 ++++++++- .../{step => steps}/backends/__init__.py | 16 +++---- .../commands/{step => steps}/backends/base.py | 0 .../{step => steps}/backends/cloud.py | 2 +- .../{step => steps}/backends/local.py | 2 +- .../{step => steps}/backends/registry.py | 2 +- .../{step => steps}/backends/slurm.py | 2 +- .../cli/commands/{step => steps}/list_cmd.py | 2 +- .../cli/commands/{step => steps}/run_cmd.py | 8 ++-- .../cli/commands/{step => steps}/show_cmd.py | 6 +-- src/nemotron/steps/optimize/SKILL.md | 6 +-- .../modelopt/distill/config/default.yaml | 4 +- 
.../steps/optimize/modelopt/prune/SKILL.md | 2 +- .../modelopt/prune/config/default.yaml | 4 +- .../modelopt/quantize/config/default.yaml | 4 +- src/nemotron/steps/peft/SKILL.md | 4 +- src/nemotron/steps/prep/SKILL.md | 6 +-- .../steps/prep/pretrain_prep/config/tiny.yaml | 6 +-- src/nemotron/steps/prep/rl_prep/SKILL.md | 2 +- .../steps/prep/sft_packing/config/tiny.yaml | 6 +-- src/nemotron/steps/pretrain/SKILL.md | 6 +-- src/nemotron/steps/rl/SKILL.md | 6 +-- src/nemotron/steps/rl/nemo_rl/dpo/SKILL.md | 2 +- .../steps/rl/nemo_rl/dpo/config/tiny.yaml | 2 +- src/nemotron/steps/rl/nemo_rl/rlhf/SKILL.md | 2 +- src/nemotron/steps/rl/nemo_rl/rlvr/SKILL.md | 2 +- src/nemotron/steps/sdg/SKILL.md | 4 +- src/nemotron/steps/sft/SKILL.md | 4 +- .../sft/megatron_bridge/config/default.yaml | 2 +- .../sft/megatron_bridge/config/tiny.yaml | 2 +- tests/steps/test_cloud_backend.py | 6 +-- tests/steps/test_translation_cli.py | 16 +++++++ 38 files changed, 108 insertions(+), 120 deletions(-) delete mode 100644 src/nemotron/cli/commands/step/__init__.py rename src/nemotron/cli/commands/{step => steps}/_resolve.py (100%) rename src/nemotron/cli/commands/{step => steps}/backends/__init__.py (67%) rename src/nemotron/cli/commands/{step => steps}/backends/base.py (100%) rename src/nemotron/cli/commands/{step => steps}/backends/cloud.py (98%) rename src/nemotron/cli/commands/{step => steps}/backends/local.py (96%) rename src/nemotron/cli/commands/{step => steps}/backends/registry.py (96%) rename src/nemotron/cli/commands/{step => steps}/backends/slurm.py (99%) rename src/nemotron/cli/commands/{step => steps}/list_cmd.py (98%) rename src/nemotron/cli/commands/{step => steps}/run_cmd.py (95%) rename src/nemotron/cli/commands/{step => steps}/show_cmd.py (93%) diff --git a/deploy/nemotron-customizer/airgap/README.md b/deploy/nemotron-customizer/airgap/README.md index 5f64e0e8f..ffcfcfbe7 100644 --- a/deploy/nemotron-customizer/airgap/README.md +++ b/deploy/nemotron-customizer/airgap/README.md 
@@ -128,7 +128,7 @@ profile still apply. Use the image printed by the runner under `step_execution_images`. ```bash -uv run nemotron step run sft/megatron_bridge \ +uv run nemotron steps run sft/megatron_bridge \ -c deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml \ -b \ run.env.container_image= diff --git a/skills/nemotron-customize/SKILL.md b/skills/nemotron-customize/SKILL.md index 94090e0e3..da6034efe 100644 --- a/skills/nemotron-customize/SKILL.md +++ b/skills/nemotron-customize/SKILL.md @@ -63,16 +63,16 @@ Goal: enumerate candidate steps and gather the user's constraints in one pass. machine-readable: ```bash -nemotron step list --json # all steps -nemotron step list --json --category sft # by category -nemotron step list --json --consumes training_jsonl # by input type -nemotron step list --json --produces checkpoint_megatron # by output type -nemotron step show # full manifest +nemotron steps list --json # all steps +nemotron steps list --json --category sft # by category +nemotron steps list --json --consumes training_jsonl # by input type +nemotron steps list --json --produces checkpoint_megatron # by output type +nemotron steps show # full manifest ``` -Implementation: [list_cmd.py](../../src/nemotron/cli/commands/step/list_cmd.py), -[show_cmd.py](../../src/nemotron/cli/commands/step/show_cmd.py), -[run_cmd.py](../../src/nemotron/cli/commands/step/run_cmd.py). +Implementation: [list_cmd.py](../../src/nemotron/cli/commands/steps/list_cmd.py), +[show_cmd.py](../../src/nemotron/cli/commands/steps/show_cmd.py), +[run_cmd.py](../../src/nemotron/cli/commands/steps/run_cmd.py). Per-step JSON schema: `{id, name, category, description, tags, path, consumes:[{type,required,description}], produces:[...], parameters:[...]}`. @@ -406,8 +406,8 @@ configs. ## Tool preferences -- **Catalog discovery**: `nemotron step list --json --consumes ` — don't grep `**/step.toml`. -- **Manifest read**: `nemotron step show ` — fastest single read. 
+- **Catalog discovery**: `nemotron steps list --json --consumes ` — don't grep `**/step.toml`. +- **Manifest read**: `nemotron steps show ` — fastest single read. - **Context packs**: load one large pack per stage via Act sub-agent — beats many small reads. - **Step.py read**: full file — they're <100 lines. - **Type validation**: read [types.toml](../../src/nemotron/steps/types.toml) once during Orient; keep in context through Verify. diff --git a/skills/nemotron-customize/context/automodel-pretrain.txt b/skills/nemotron-customize/context/automodel-pretrain.txt index 408c39c6f..c34d21e91 100644 --- a/skills/nemotron-customize/context/automodel-pretrain.txt +++ b/skills/nemotron-customize/context/automodel-pretrain.txt @@ -100,7 +100,7 @@ Default `model.pretrained_model_name_or_path` in this repo is `[[models]]`). Override at CLI: ```bash -nemotron step run pretrain/automodel -c default \ +nemotron steps run pretrain/automodel -c default \ model.pretrained_model_name_or_path= ``` diff --git a/src/nemotron/cli/bin/nemotron.py b/src/nemotron/cli/bin/nemotron.py index 30f9f8d0c..806224059 100644 --- a/src/nemotron/cli/bin/nemotron.py +++ b/src/nemotron/cli/bin/nemotron.py @@ -117,7 +117,6 @@ def _register_groups() -> None: ("super3", "nemotron.cli.commands.super3", "super3_app"), ("kit", "nemotron.cli.kit", "kit_app"), ("embed", "nemotron.cli.commands.embed", "embed_app"), - ("step", "nemotron.cli.commands.step", "step_app"), ("steps", "nemotron.cli.commands.steps", "steps_app"), ) diff --git a/src/nemotron/cli/commands/step/__init__.py b/src/nemotron/cli/commands/step/__init__.py deleted file mode 100644 index 9e1b49334..000000000 --- a/src/nemotron/cli/commands/step/__init__.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Generic step CLI — list / show / run any discovered step. - -Designed for agentic use: every step.py + step.toml in src/nemotron/steps/ is -auto-discovered. The agent's surface is uniform regardless of the underlying -framework (AutoModel, Megatron-Bridge, NeMo-RL, Data Designer). -""" -from __future__ import annotations - -import typer - -from nemotron.cli.commands.step.list_cmd import list_steps -from nemotron.cli.commands.step.run_cmd import run_step -from nemotron.cli.commands.step.show_cmd import show_step - -step_app = typer.Typer( - name="step", - help="Discover, inspect, and run any registered step.", - no_args_is_help=True, - rich_markup_mode="rich", - context_settings={"help_option_names": ["-h", "--help"]}, -) - -step_app.command("list", help="List discovered steps. 
Use --json for machine-readable output.")(list_steps) -step_app.command("show", help="Show a step's manifest, runspec, and parameters.")(show_step) -step_app.command( - "run", - help="Run a step on the chosen executor profile.", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -)(run_step) - -__all__ = ["step_app"] diff --git a/src/nemotron/cli/commands/step/_resolve.py b/src/nemotron/cli/commands/steps/_resolve.py similarity index 100% rename from src/nemotron/cli/commands/step/_resolve.py rename to src/nemotron/cli/commands/steps/_resolve.py diff --git a/src/nemotron/cli/commands/steps/_typer_group.py b/src/nemotron/cli/commands/steps/_typer_group.py index 5296c479e..1f4025bdc 100644 --- a/src/nemotron/cli/commands/steps/_typer_group.py +++ b/src/nemotron/cli/commands/steps/_typer_group.py @@ -16,15 +16,33 @@ from __future__ import annotations +import typer + from nemo_runspec.recipe_typer import RecipeTyper +from nemotron.cli.commands.steps.list_cmd import list_steps +from nemotron.cli.commands.steps.run_cmd import run_step +from nemotron.cli.commands.steps.show_cmd import show_step from nemotron.cli.commands.steps.translation import META as TRANSLATION_META from nemotron.cli.commands.steps.translation import translation + +def _add_catalog_commands(app: typer.Typer) -> None: + app.command("list", help="List discovered steps. 
Use --json for machine-readable output.")(list_steps) + app.command("show", help="Show a step's manifest, runspec, and parameters.")(show_step) + app.command( + "run", + help="Run a step on the chosen executor profile.", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, + )(run_step) + + steps_app = RecipeTyper( name="steps", - help="Agentic workflow steps", + help="Discover, inspect, run, and compose agentic workflow steps.", no_args_is_help=True, rich_markup_mode="rich", + context_settings={"help_option_names": ["-h", "--help"]}, ) +_add_catalog_commands(steps_app) steps_app.add_recipe_command(translation, meta=TRANSLATION_META) diff --git a/src/nemotron/cli/commands/step/backends/__init__.py b/src/nemotron/cli/commands/steps/backends/__init__.py similarity index 67% rename from src/nemotron/cli/commands/step/backends/__init__.py rename to src/nemotron/cli/commands/steps/backends/__init__.py index eaabbde89..3b1ddac1c 100644 --- a/src/nemotron/cli/commands/step/backends/__init__.py +++ b/src/nemotron/cli/commands/steps/backends/__init__.py @@ -12,24 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Backend protocol + registry for ``nemotron step run``. +"""Backend protocol + registry for ``nemotron steps run``. A ``Backend`` knows how to take a parsed step (script + runspec + rendered -config + env profile) and submit it for execution. ``step run`` selects a +config + env profile) and submit it for execution. ``steps run`` selects a backend by name from the env profile's ``executor`` field and calls :meth:`Backend.submit`. Adding a new backend is one new file under this -package — ``step run`` itself does not change. +package — ``steps run`` itself does not change. 
""" from __future__ import annotations -from nemotron.cli.commands.step.backends.base import Backend, JobContext -from nemotron.cli.commands.step.backends.cloud import CloudBackend -from nemotron.cli.commands.step.backends.local import LocalBackend -from nemotron.cli.commands.step.backends.registry import get_backend, register +from nemotron.cli.commands.steps.backends.base import Backend, JobContext +from nemotron.cli.commands.steps.backends.cloud import CloudBackend +from nemotron.cli.commands.steps.backends.local import LocalBackend +from nemotron.cli.commands.steps.backends.registry import get_backend, register # Built-in backends are registered here so a fresh import sees them all. register("local", LocalBackend) -register("slurm", "nemotron.cli.commands.step.backends.slurm:SlurmBackend") +register("slurm", "nemotron.cli.commands.steps.backends.slurm:SlurmBackend") register("lepton", CloudBackend) register("dgxcloud", CloudBackend) diff --git a/src/nemotron/cli/commands/step/backends/base.py b/src/nemotron/cli/commands/steps/backends/base.py similarity index 100% rename from src/nemotron/cli/commands/step/backends/base.py rename to src/nemotron/cli/commands/steps/backends/base.py diff --git a/src/nemotron/cli/commands/step/backends/cloud.py b/src/nemotron/cli/commands/steps/backends/cloud.py similarity index 98% rename from src/nemotron/cli/commands/step/backends/cloud.py rename to src/nemotron/cli/commands/steps/backends/cloud.py index 0a1208a6b..55b74af7f 100644 --- a/src/nemotron/cli/commands/step/backends/cloud.py +++ b/src/nemotron/cli/commands/steps/backends/cloud.py @@ -36,7 +36,7 @@ from pathlib import Path from nemo_runspec.execution import execute_cloud, execute_cloud_ray -from nemotron.cli.commands.step.backends.base import JobContext +from nemotron.cli.commands.steps.backends.base import JobContext class CloudBackend: diff --git a/src/nemotron/cli/commands/step/backends/local.py b/src/nemotron/cli/commands/steps/backends/local.py similarity index 
96% rename from src/nemotron/cli/commands/step/backends/local.py rename to src/nemotron/cli/commands/steps/backends/local.py index ede023ae4..35a6bc2eb 100644 --- a/src/nemotron/cli/commands/step/backends/local.py +++ b/src/nemotron/cli/commands/steps/backends/local.py @@ -22,7 +22,7 @@ import sys from nemo_runspec.execution import execute_local -from nemotron.cli.commands.step.backends.base import JobContext +from nemotron.cli.commands.steps.backends.base import JobContext class LocalBackend: diff --git a/src/nemotron/cli/commands/step/backends/registry.py b/src/nemotron/cli/commands/steps/backends/registry.py similarity index 96% rename from src/nemotron/cli/commands/step/backends/registry.py rename to src/nemotron/cli/commands/steps/backends/registry.py index 1f754982e..0ecf57673 100644 --- a/src/nemotron/cli/commands/step/backends/registry.py +++ b/src/nemotron/cli/commands/steps/backends/registry.py @@ -25,7 +25,7 @@ import typer -from nemotron.cli.commands.step.backends.base import Backend +from nemotron.cli.commands.steps.backends.base import Backend _BackendFactory = type[Backend] | Callable[[], Backend] | str _REGISTRY: dict[str, _BackendFactory] = {} diff --git a/src/nemotron/cli/commands/step/backends/slurm.py b/src/nemotron/cli/commands/steps/backends/slurm.py similarity index 99% rename from src/nemotron/cli/commands/step/backends/slurm.py rename to src/nemotron/cli/commands/steps/backends/slurm.py index 39126a77b..3ca194b70 100644 --- a/src/nemotron/cli/commands/step/backends/slurm.py +++ b/src/nemotron/cli/commands/steps/backends/slurm.py @@ -32,7 +32,7 @@ CodePackager, SelfContainedPackager, ) -from nemotron.cli.commands.step.backends.base import JobContext +from nemotron.cli.commands.steps.backends.base import JobContext class SlurmBackend: diff --git a/src/nemotron/cli/commands/step/list_cmd.py b/src/nemotron/cli/commands/steps/list_cmd.py similarity index 98% rename from src/nemotron/cli/commands/step/list_cmd.py rename to 
src/nemotron/cli/commands/steps/list_cmd.py index 053ba4f95..18918d8c3 100644 --- a/src/nemotron/cli/commands/step/list_cmd.py +++ b/src/nemotron/cli/commands/steps/list_cmd.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""`nemotron step list` — discovery for humans and agents.""" +"""`nemotron steps list` — discovery for humans and agents.""" from __future__ import annotations diff --git a/src/nemotron/cli/commands/step/run_cmd.py b/src/nemotron/cli/commands/steps/run_cmd.py similarity index 95% rename from src/nemotron/cli/commands/step/run_cmd.py rename to src/nemotron/cli/commands/steps/run_cmd.py index 75a0e9b17..84d6d43a0 100644 --- a/src/nemotron/cli/commands/step/run_cmd.py +++ b/src/nemotron/cli/commands/steps/run_cmd.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""`nemotron step run` — generic step execution. +"""`nemotron steps run` — generic step execution. Thin dispatcher. The job of this command is to: 1. Resolve a step id → step.py + runspec. @@ -21,7 +21,7 @@ 4. Hand off to ``backend.submit(ctx)``. All execution-mechanics live in the per-backend modules under -``nemotron.cli.commands.step.backends.*``. To add a new submission target, +``nemotron.cli.commands.steps.backends.*``. To add a new submission target, write one Backend subclass and ``register()`` it — no edits here. 
""" @@ -43,8 +43,8 @@ from nemo_runspec.display import display_job_config, display_job_submission from nemo_runspec.env import parse_env from nemo_runspec.execution import build_env_vars, get_startup_commands -from nemotron.cli.commands.step._resolve import resolve_step -from nemotron.cli.commands.step.backends import JobContext, get_backend +from nemotron.cli.commands.steps._resolve import resolve_step +from nemotron.cli.commands.steps.backends import JobContext, get_backend def run_step( diff --git a/src/nemotron/cli/commands/step/show_cmd.py b/src/nemotron/cli/commands/steps/show_cmd.py similarity index 93% rename from src/nemotron/cli/commands/step/show_cmd.py rename to src/nemotron/cli/commands/steps/show_cmd.py index 98f049e5a..2197dd3a6 100644 --- a/src/nemotron/cli/commands/step/show_cmd.py +++ b/src/nemotron/cli/commands/steps/show_cmd.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""`nemotron step show` — full manifest + runspec for one step.""" +"""`nemotron steps show` — full manifest + runspec for one step.""" from __future__ import annotations import json as json_module @@ -23,8 +23,8 @@ from rich.console import Console from nemo_runspec import parse as parse_runspec -from nemotron.cli.commands.step._resolve import resolve_step -from nemotron.cli.commands.step.list_cmd import _step_to_dict +from nemotron.cli.commands.steps._resolve import resolve_step +from nemotron.cli.commands.steps.list_cmd import _step_to_dict console = Console() diff --git a/src/nemotron/steps/optimize/SKILL.md b/src/nemotron/steps/optimize/SKILL.md index 40352dbf6..e97d96bef 100644 --- a/src/nemotron/steps/optimize/SKILL.md +++ b/src/nemotron/steps/optimize/SKILL.md @@ -69,9 +69,9 @@ prep/pretrain_prep → optimize/modelopt/distill → eval/model_eval # stand ## Smoke commands ```bash -nemotron step run optimize/modelopt/quantize -c tiny -nemotron step run optimize/modelopt/prune -c tiny -nemotron step 
run optimize/modelopt/distill -c tiny # uses use_mock_data=true +nemotron steps run optimize/modelopt/quantize -c tiny +nemotron steps run optimize/modelopt/prune -c tiny +nemotron steps run optimize/modelopt/distill -c tiny # uses use_mock_data=true ``` ## Guardrails diff --git a/src/nemotron/steps/optimize/modelopt/distill/config/default.yaml b/src/nemotron/steps/optimize/modelopt/distill/config/default.yaml index f72f4c718..b9f6b0d4a 100644 --- a/src/nemotron/steps/optimize/modelopt/distill/config/default.yaml +++ b/src/nemotron/steps/optimize/modelopt/distill/config/default.yaml @@ -21,8 +21,8 @@ # prep/pretrain_prep first and pass the produced data prefixes via data_paths. # # Usage: -# nemotron step run optimize/modelopt/distill -c default -# nemotron step run optimize/modelopt/distill -c default args.use_mock_data=true args.train_iters=100 +# nemotron steps run optimize/modelopt/distill -c default +# nemotron steps run optimize/modelopt/distill -c default args.use_mock_data=true args.train_iters=100 # # Script arguments are forwarded from args: using underscore CLI flags. For # example, args.teacher_hf_path becomes --teacher_hf_path. New upstream flags can diff --git a/src/nemotron/steps/optimize/modelopt/prune/SKILL.md b/src/nemotron/steps/optimize/modelopt/prune/SKILL.md index fc2510317..65a4961de 100644 --- a/src/nemotron/steps/optimize/modelopt/prune/SKILL.md +++ b/src/nemotron/steps/optimize/modelopt/prune/SKILL.md @@ -13,7 +13,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c - Consume `checkpoint_hf`. - Produce pruned `checkpoint_hf`. -- Smoke with `nemotron step run optimize/modelopt/prune -c tiny`. +- Smoke with `nemotron steps run optimize/modelopt/prune -c tiny`. 
## Configure diff --git a/src/nemotron/steps/optimize/modelopt/prune/config/default.yaml b/src/nemotron/steps/optimize/modelopt/prune/config/default.yaml index be04398a1..4c2d1dae2 100644 --- a/src/nemotron/steps/optimize/modelopt/prune/config/default.yaml +++ b/src/nemotron/steps/optimize/modelopt/prune/config/default.yaml @@ -25,8 +25,8 @@ # extra_args when using your own data. # # Usage: -# nemotron step run optimize/modelopt/prune -c default -# nemotron step run optimize/modelopt/prune -c default args.prune_target_params=6e9 +# nemotron steps run optimize/modelopt/prune -c default +# nemotron steps run optimize/modelopt/prune -c default args.prune_target_params=6e9 # # Script arguments are forwarded from args: using underscore CLI flags. For # example, args.prune_target_params becomes --prune_target_params. New upstream diff --git a/src/nemotron/steps/optimize/modelopt/quantize/config/default.yaml b/src/nemotron/steps/optimize/modelopt/quantize/config/default.yaml index 991f3ceed..2fd5e05d7 100644 --- a/src/nemotron/steps/optimize/modelopt/quantize/config/default.yaml +++ b/src/nemotron/steps/optimize/modelopt/quantize/config/default.yaml @@ -22,8 +22,8 @@ # Megatron-Bridge quantization scripts. # # Usage: -# nemotron step run optimize/modelopt/quantize -c default -# nemotron step run optimize/modelopt/quantize -c default args.export_quant_cfg=fp8 +# nemotron steps run optimize/modelopt/quantize -c default +# nemotron steps run optimize/modelopt/quantize -c default args.export_quant_cfg=fp8 # # Script arguments are forwarded from args: using hyphenated CLI flags. For # example, args.hf_model_id becomes --hf-model-id. New upstream flags can be diff --git a/src/nemotron/steps/peft/SKILL.md b/src/nemotron/steps/peft/SKILL.md index 761b11663..bd049e3d0 100644 --- a/src/nemotron/steps/peft/SKILL.md +++ b/src/nemotron/steps/peft/SKILL.md @@ -61,8 +61,8 @@ instruction-format adherence). 
## Smoke commands ```bash -nemotron step run peft/automodel -c tiny -nemotron step run peft/megatron_bridge -c tiny # requires compatible packed_parquet + base checkpoint +nemotron steps run peft/automodel -c tiny +nemotron steps run peft/megatron_bridge -c tiny # requires compatible packed_parquet + base checkpoint ``` ## Guardrails diff --git a/src/nemotron/steps/prep/SKILL.md b/src/nemotron/steps/prep/SKILL.md index 7cb192f5e..1f3b36fd8 100644 --- a/src/nemotron/steps/prep/SKILL.md +++ b/src/nemotron/steps/prep/SKILL.md @@ -46,9 +46,9 @@ Skip packing when: ## Smoke commands ```bash -nemotron step run prep/sft_packing -c tiny -nemotron step run prep/pretrain_prep -c tiny -nemotron step run prep/rl_prep -c tiny +nemotron steps run prep/sft_packing -c tiny +nemotron steps run prep/pretrain_prep -c tiny +nemotron steps run prep/rl_prep -c tiny ``` ## Patterns to cite diff --git a/src/nemotron/steps/prep/pretrain_prep/config/tiny.yaml b/src/nemotron/steps/prep/pretrain_prep/config/tiny.yaml index 95fd49e9b..a66a1976a 100644 --- a/src/nemotron/steps/prep/pretrain_prep/config/tiny.yaml +++ b/src/nemotron/steps/prep/pretrain_prep/config/tiny.yaml @@ -17,9 +17,9 @@ # step's data/ dir so the same config works under any source layout. # # Usage: -# nemotron step run prep/pretrain_prep -c tiny # local -# nemotron step run prep/pretrain_prep -c tiny -r lepton_pretrain_dataprep -# nemotron step run prep/pretrain_prep -c tiny -r slurm_pretrain_dataprep +# nemotron steps run prep/pretrain_prep -c tiny # local +# nemotron steps run prep/pretrain_prep -c tiny -r lepton_pretrain_dataprep +# nemotron steps run prep/pretrain_prep -c tiny -r slurm_pretrain_dataprep # blend_path is omitted on purpose — step.py defaults to data/blend_tiny.json. 
output_dir: ${oc.env:PRETRAIN_OUTPUT_DIR,./output/pretrain_dataprep_tiny} diff --git a/src/nemotron/steps/prep/rl_prep/SKILL.md b/src/nemotron/steps/prep/rl_prep/SKILL.md index f23c69c70..6e47ff8f2 100644 --- a/src/nemotron/steps/prep/rl_prep/SKILL.md +++ b/src/nemotron/steps/prep/rl_prep/SKILL.md @@ -13,7 +13,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c - Consume `training_jsonl` through an RL data blend. - Produce sharded `training_jsonl` ready for `rl/nemo_rl/dpo`, `rl/nemo_rl/rlvr`, or `rl/nemo_rl/rlhf`. -- Smoke with `nemotron step run prep/rl_prep -c tiny`. +- Smoke with `nemotron steps run prep/rl_prep -c tiny`. ## Configure diff --git a/src/nemotron/steps/prep/sft_packing/config/tiny.yaml b/src/nemotron/steps/prep/sft_packing/config/tiny.yaml index e346d6de1..96ca8c7da 100644 --- a/src/nemotron/steps/prep/sft_packing/config/tiny.yaml +++ b/src/nemotron/steps/prep/sft_packing/config/tiny.yaml @@ -17,9 +17,9 @@ # without dragging the recipes/ tree along. # # Usage: -# nemotron step run prep/sft_packing -c tiny # local -# nemotron step run prep/sft_packing -c tiny -r slurm_sft_dataprep_tiny -# nemotron step run prep/sft_packing -c tiny -r test_lepton_sft_dataprep +# nemotron steps run prep/sft_packing -c tiny # local +# nemotron steps run prep/sft_packing -c tiny -r slurm_sft_dataprep_tiny +# nemotron steps run prep/sft_packing -c tiny -r test_lepton_sft_dataprep # blend_path is omitted on purpose — step.py defaults to the in-step # data/blend_tiny.json so the same config works under any source layout diff --git a/src/nemotron/steps/pretrain/SKILL.md b/src/nemotron/steps/pretrain/SKILL.md index 1fbcc4787..a9dacd500 100644 --- a/src/nemotron/steps/pretrain/SKILL.md +++ b/src/nemotron/steps/pretrain/SKILL.md @@ -20,7 +20,7 @@ The "default model" column shows what the shipped `config/default.yaml` selects. 
Override at CLI: ```bash -nemotron step run pretrain/automodel -c default \ +nemotron steps run pretrain/automodel -c default \ model.pretrained_model_name_or_path= ``` @@ -88,8 +88,8 @@ curate/nemo_curator → prep/pretrain_prep → pretrain/automodel → che ## Smoke commands ```bash -nemotron step run pretrain/automodel -c tiny -nemotron step run pretrain/megatron_bridge -c tiny +nemotron steps run pretrain/automodel -c tiny +nemotron steps run pretrain/megatron_bridge -c tiny ``` ## Patterns to cite diff --git a/src/nemotron/steps/rl/SKILL.md b/src/nemotron/steps/rl/SKILL.md index fe5946bed..5e4ca5eec 100644 --- a/src/nemotron/steps/rl/SKILL.md +++ b/src/nemotron/steps/rl/SKILL.md @@ -74,9 +74,9 @@ when the next consumer (eval, deployment) expects HF. ## Smoke commands ```bash -nemotron step run rl/nemo_rl/dpo -c tiny -nemotron step run rl/nemo_rl/rlvr -c tiny -nemotron step run rl/nemo_rl/rlhf -c tiny +nemotron steps run rl/nemo_rl/dpo -c tiny +nemotron steps run rl/nemo_rl/rlvr -c tiny +nemotron steps run rl/nemo_rl/rlhf -c tiny ``` ## Guardrails diff --git a/src/nemotron/steps/rl/nemo_rl/dpo/SKILL.md b/src/nemotron/steps/rl/nemo_rl/dpo/SKILL.md index 58ae297eb..41efd655b 100644 --- a/src/nemotron/steps/rl/nemo_rl/dpo/SKILL.md +++ b/src/nemotron/steps/rl/nemo_rl/dpo/SKILL.md @@ -14,7 +14,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c - Consume `training_jsonl` with prompt, chosen, and rejected fields. - Consume an SFT `checkpoint_megatron` policy. - Produce a DPO-aligned `checkpoint_megatron`. -- Smoke with `nemotron step run rl/nemo_rl/dpo -c tiny`. +- Smoke with `nemotron steps run rl/nemo_rl/dpo -c tiny`. 
## Configure diff --git a/src/nemotron/steps/rl/nemo_rl/dpo/config/tiny.yaml b/src/nemotron/steps/rl/nemo_rl/dpo/config/tiny.yaml index 9e2224d94..12c556424 100644 --- a/src/nemotron/steps/rl/nemo_rl/dpo/config/tiny.yaml +++ b/src/nemotron/steps/rl/nemo_rl/dpo/config/tiny.yaml @@ -23,7 +23,7 @@ # - dataset = a HF preference set via BinaryPreferenceDataset. # # Usage: -# nemotron step run rl/nemo_rl/dpo -c tiny -r lepton_rl +# nemotron steps run rl/nemo_rl/dpo -c tiny -r lepton_rl dpo: max_num_epochs: 1 diff --git a/src/nemotron/steps/rl/nemo_rl/rlhf/SKILL.md b/src/nemotron/steps/rl/nemo_rl/rlhf/SKILL.md index 3d132a34f..16eb5d234 100644 --- a/src/nemotron/steps/rl/nemo_rl/rlhf/SKILL.md +++ b/src/nemotron/steps/rl/nemo_rl/rlhf/SKILL.md @@ -15,7 +15,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c - Consume an SFT `checkpoint_megatron` policy. - Consume a reward-model `checkpoint_hf`. - Produce an RLHF-aligned `checkpoint_megatron`. -- Smoke with `nemotron step run rl/nemo_rl/rlhf -c tiny`. +- Smoke with `nemotron steps run rl/nemo_rl/rlhf -c tiny`. ## Configure diff --git a/src/nemotron/steps/rl/nemo_rl/rlvr/SKILL.md b/src/nemotron/steps/rl/nemo_rl/rlvr/SKILL.md index 1178e1794..3798a6077 100644 --- a/src/nemotron/steps/rl/nemo_rl/rlvr/SKILL.md +++ b/src/nemotron/steps/rl/nemo_rl/rlvr/SKILL.md @@ -14,7 +14,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c - Consume prompt `training_jsonl` with verifier fields such as answers. - Consume an SFT `checkpoint_megatron` policy. - Produce an RLVR-aligned `checkpoint_megatron`. -- Smoke with `nemotron step run rl/nemo_rl/rlvr -c tiny`. +- Smoke with `nemotron steps run rl/nemo_rl/rlvr -c tiny`. 
## Configure diff --git a/src/nemotron/steps/sdg/SKILL.md b/src/nemotron/steps/sdg/SKILL.md index 7352daf32..a3a4cdd8f 100644 --- a/src/nemotron/steps/sdg/SKILL.md +++ b/src/nemotron/steps/sdg/SKILL.md @@ -63,8 +63,8 @@ sdg/data_designer (rl_pref.yaml) → prep/rl_prep → rl/nemo_ ## Smoke commands ```bash -nemotron step run sdg/data_designer -c tiny -nemotron step run sdg/data_designer -c default --extra-args=--preview +nemotron steps run sdg/data_designer -c tiny +nemotron steps run sdg/data_designer -c default --extra-args=--preview ``` ## Patterns to cite diff --git a/src/nemotron/steps/sft/SKILL.md b/src/nemotron/steps/sft/SKILL.md index 375d9ca69..0c1a9cf65 100644 --- a/src/nemotron/steps/sft/SKILL.md +++ b/src/nemotron/steps/sft/SKILL.md @@ -50,8 +50,8 @@ Pick an SFT backend and keep data and checkpoint formats compatible. ## Smoke commands ```bash -nemotron step run sft/automodel -c tiny -nemotron step run sft/megatron_bridge -c tiny # requires compatible packed_parquet +nemotron steps run sft/automodel -c tiny +nemotron steps run sft/megatron_bridge -c tiny # requires compatible packed_parquet ``` ## Patterns to cite diff --git a/src/nemotron/steps/sft/megatron_bridge/config/default.yaml b/src/nemotron/steps/sft/megatron_bridge/config/default.yaml index 27cb6cab0..3860b6dcf 100644 --- a/src/nemotron/steps/sft/megatron_bridge/config/default.yaml +++ b/src/nemotron/steps/sft/megatron_bridge/config/default.yaml @@ -15,7 +15,7 @@ # Tiny SFT for the 2-node slurm_sft profile. # Pulls base weights from HuggingFace via AutoBridge so no pre-existing # Megatron checkpoint is required. Uses the packed_parquet shards produced by -# `nemotron step run prep/sft_packing -c tiny -r slurm_sft_dataprep_tiny`. +# `nemotron steps run prep/sft_packing -c tiny -r slurm_sft_dataprep_tiny`. # # Note: dataset paths are explicit (Megatron-Bridge's vanilla schema). This # step is generic — recipe-specific dir shortcuts live in recipes/, not here. 
diff --git a/src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml b/src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml index 7eed9c173..82ec2ce68 100644 --- a/src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml +++ b/src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml @@ -16,7 +16,7 @@ # Points at lepton's lustre-shared mount by default. # Pulls base weights from HuggingFace via AutoBridge so no pre-existing # Megatron checkpoint is required. Uses the packed_parquet shards produced by -# `nemotron step run prep/sft_packing -c tiny -r lepton_sft_dataprep`. +# `nemotron steps run prep/sft_packing -c tiny -r lepton_sft_dataprep`. # Override the container's stale Megatron-Bridge with a branch that supports # packed-parquet specs. diff --git a/tests/steps/test_cloud_backend.py b/tests/steps/test_cloud_backend.py index ba1fad64f..428b04fae 100644 --- a/tests/steps/test_cloud_backend.py +++ b/tests/steps/test_cloud_backend.py @@ -19,9 +19,9 @@ import pytest -import nemotron.cli.commands.step.backends.cloud as cloud_mod -from nemotron.cli.commands.step.backends.base import JobContext -from nemotron.cli.commands.step.backends.cloud import CloudBackend +import nemotron.cli.commands.steps.backends.cloud as cloud_mod +from nemotron.cli.commands.steps.backends.base import JobContext +from nemotron.cli.commands.steps.backends.cloud import CloudBackend def _ctx(step_id: str, *, launch: str = "ray") -> JobContext: diff --git a/tests/steps/test_translation_cli.py b/tests/steps/test_translation_cli.py index 56b01d9a8..ca4863ec1 100644 --- a/tests/steps/test_translation_cli.py +++ b/tests/steps/test_translation_cli.py @@ -46,6 +46,22 @@ def test_root_cli_registers_steps_translation_command() -> None: assert "translation" in result.output +def test_root_cli_registers_steps_catalog_commands() -> None: + result = CliRunner().invoke(app, ["steps", "--help"]) + + assert result.exit_code == 0 + assert "list" in result.output + assert "show" in result.output + assert "run" 
in result.output + + +def test_root_cli_does_not_register_step_alias() -> None: + result = CliRunner().invoke(app, ["step", "--help"]) + + assert result.exit_code != 0 + assert "No such command" in result.output + + def test_translation_cli_runs_checked_in_step(monkeypatch: pytest.MonkeyPatch) -> None: config = { "input_path": "/data/source.jsonl", From 6332e3b3eb5156e6dbece685c019786cf4633dfc Mon Sep 17 00:00:00 2001 From: Rakesh Paul Date: Mon, 11 May 2026 15:49:10 +0530 Subject: [PATCH 5/5] Airgap SKILL addition Signed-off-by: Rakesh Paul --- deploy/nemotron-customizer/airgap/SKILL.md | 115 +++++++++++++++++++++ skills/nemotron-customize/SKILL.md | 17 ++- 2 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 deploy/nemotron-customizer/airgap/SKILL.md diff --git a/deploy/nemotron-customizer/airgap/SKILL.md b/deploy/nemotron-customizer/airgap/SKILL.md new file mode 100644 index 000000000..48a37cd2a --- /dev/null +++ b/deploy/nemotron-customizer/airgap/SKILL.md @@ -0,0 +1,115 @@ +--- +name: nemotron-customizer-airgap +description: Prepare, validate, build, and use Nemotron Customizer airgap image bundles for offline clusters. Use when planning airgapped deployments, editing deploy/nemotron-customizer/airgap/airgap.yaml, selecting workflow targets, grouping step execution images, baking repo overlays or wheel additions, resuming airgap runner builds, or submitting `nemotron steps run` jobs inside an airgapped environment. +--- + +# Nemotron Customizer Airgap + +Use this skill to help an agent produce a connected-machine airgap bundle and +then submit Nemotron Customizer steps from the airgapped side. Keep it grounded +in the checked-in runner and manifests; do not invent a parallel packaging flow. + +## Read First + +- `deploy/nemotron-customizer/airgap/README.md` for the operator flow. +- `deploy/nemotron-customizer/airgap/airgap.yaml` for the current image map. +- `deploy/nemotron-customizer/airgap/runner.py` when changing behavior. 
+- `tests/deploy/test_airgap_runner.py` before editing runner logic. +- `deploy/nemotron-customizer/airgap/configs/` for runtime overlay configs. + +For selected steps, inspect the catalog through the CLI: + +```bash +uv run nemotron steps show <step-id> --json +``` + +## Workflow + +1. Establish the side of the workflow: + - Connected machine: validate, build, save image tarballs. + - Airgapped side: load images, set env profiles, run selected steps. + +2. Gather the minimum inputs: + - Target steps and config names, for example `sft/megatron_bridge:tiny`. + - Target architecture or Docker platform, for example `linux/amd64`. + - Available base images and whether the connected machine can pull them. + - Airgapped env profile name, mounts, model/data/checkpoint locations. + - Whether destructive or expensive actions such as `--execute`, Docker build, + Docker volume cleanup, or state-file removal are explicitly allowed. + +3. Plan with the runner first: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml +``` + +Use `--target <step-id>:<config>` for one-off selections without editing YAML. +The runner expands dependencies from `dependencies`, validates selected step +files/configs, groups execution images, and prints selected execution images. + +4. Edit `airgap.yaml` only where the runner expects configuration: + - `workflow.stages` or CLI `--target` for selected customer steps. + - `dependencies` for explicit upstream Nemotron Customizer step outputs. + - `step_execution_images` for step-to-image mapping. + - `execution_images` for base image, tag, tar, platform, and import probes. + - `launcher_image` for the launcher container. + +5.
Execute only when the user asks for a real build: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml \ + --execute +``` + +If a build fails midway, keep `airgap-build-state.yaml` and rerun the same +command. Remove or move that state only when intentionally changing the plan. + +6. On the airgapped side, use images from `out/airgap-manifest.yaml` under +`step_execution_images`. Submit with the plural CLI: + +```bash +uv run nemotron steps run <step-id> \ + -c <config> \ + -b <env-profile> \ + run.env.container_image=<execution-image> +``` + +For `sft/megatron_bridge`, prefer the airgap overlay configs under +`deploy/nemotron-customizer/airgap/configs/`; they clear runtime git auto-mounts +because the runner bakes those repos into the execution image. + +## Guardrails + +- Keep models, datasets, checkpoints, secrets, and customer files out of images. + Put them on persistent storage and reference them through config overrides and + `run.env.mounts`. +- Treat `${auto_mount:git+...}` as a connected-machine build input. The runner + bakes pinned repo overlays into execution images so airgapped jobs do not clone + from GitHub. +- Do not add missing packages blindly. Let `discover-execution-deps` and + import probes determine small additions; keep heavyweight framework deps in + the base image choice. +- Preserve offline defaults unless the user has an internal mirror: + `HF_HUB_OFFLINE=1`, `TRANSFORMERS_OFFLINE=1`, `HF_DATASETS_OFFLINE=1`, + and `WANDB_MODE=offline`. +- Use `nemotron steps ...`; do not reintroduce `nemotron step ...`.
+ +## Validation + +After edits to runner logic, YAML structure, or airgap docs, run: + +```bash +uv run pytest tests/deploy/test_airgap_runner.py -q +``` + +For CLI-facing examples, also smoke the command shape: + +```bash +uv run nemotron steps --help +uv run nemotron steps show prep/sft_packing --json +``` + +Do not run Docker build/save stages during validation unless the user explicitly +asked for a real connected-machine bundle build. diff --git a/skills/nemotron-customize/SKILL.md b/skills/nemotron-customize/SKILL.md index da6034efe..fb3d5a491 100644 --- a/skills/nemotron-customize/SKILL.md +++ b/skills/nemotron-customize/SKILL.md @@ -40,6 +40,7 @@ Concise. Technical. No fluff. | Cross-step constraint (tokenizer lock, eval bookends, ...) | `src/nemotron/steps/patterns/.md` | | Artifact compatibility / `is_a` / `convert_to` | [src/nemotron/steps/types.toml](../../src/nemotron/steps/types.toml) | | GPU memory / parallelism heuristics | [src/nemotron/steps/hardware.md](../../src/nemotron/steps/hardware.md) | +| Explicit airgap/offline bundle request only | [deploy/nemotron-customizer/airgap/SKILL.md](../../deploy/nemotron-customizer/airgap/SKILL.md) | | Library API extracts for code generation | [context/index.toml](context/index.toml) → `context/.txt` | | Project scaffold rules (CLI, pyproject, README, deploy) | [act/PROJECT.md](act/PROJECT.md) | | Per-stage code rules (R1–R5, dry-run, W&B) | [act/STAGE.md](act/STAGE.md) | @@ -144,7 +145,6 @@ Goal: produce a markdown plan the user reviews before any code is written. | 6 | RL warm-starts from SFT; rewards validated before scale. | [patterns/rl-validate-rewards-before-scale.md](../../src/nemotron/steps/patterns/rl-validate-rewards-before-scale.md) | | 7 | GPU count ≥ chosen model's `min_gpus` (from `[[models]]` block in each `step.toml`). 
| step.toml + [hardware.md](../../src/nemotron/steps/hardware.md) | | 8 | Sovereign / customization patterns checked: `cpt-data-blend-scoping`, `sft-data-blending`, `multilingual-tokenizer-check`, `data-quality-before-quantity`, `sdg-pipeline-versioning`, `byob-benchmark-design`, `pretrain-token-budget-before-scale`, `sft-small-dataset-prefer-lora`, `convert-checkpoint-safety`. | [patterns/](../../src/nemotron/steps/patterns/) | - When a check fails: surface it as a `⚠` warning in the plan and propose a fix. When the user can't satisfy it (e.g. hardware), propose alternatives in descending preference: smaller model → AutoModel instead of Megatron-Bridge → @@ -187,6 +187,7 @@ graph LR | Resource | Required by | Notes | |---|---|---| | | | | + ```` **Step 2.5 — Present the plan and wait.** Don't proceed to Act until the @@ -356,6 +357,17 @@ catalog-based stage." If the same Explorer build keeps appearing across projects, suggest the user run `/nemotron-add-step` to land it in the catalog. +### Explicit airgap handoff + +Do this only when the user explicitly asks for airgap, offline/no-internet +execution, image tarballs, or Nemotron Customizer airgap bundle work. Do not +include it in normal local, Slurm, Lepton, Airflow, or Kubeflow planning. + +When triggered, stop the generic project-generation path and load +[deploy/nemotron-customizer/airgap/SKILL.md](../../deploy/nemotron-customizer/airgap/SKILL.md). +Use the approved catalog step IDs as airgap runner `--target <step-id>:<config>` +values, then follow that skill's validate/build/run workflow. + ### Choosing a mode | User says | Mode | @@ -367,6 +379,7 @@ run `/nemotron-add-step` to land it in the catalog.
| "Translate EN → \" | Catalog ([translate/nemo_skills](../../src/nemotron/steps/translate/nemo_skills/)) | | "Curate web text" | Catalog ([curate/nemo_curator](../../src/nemotron/steps/curate/nemo_curator/)) | | "Deploy to TensorRT-LLM" | Explorer (no step yet — derive from upstream library docs and add a `convert/*` step if the path stabilizes) | +| "Build an airgap bundle", "offline cluster", "no internet", "image tarballs for these steps" | Explicit airgap handoff | | "Train with X exotic backend" | Explorer or **ask** | | Ambiguous | **Ask** | @@ -437,6 +450,8 @@ configs. - Tune parallelism beyond what `hardware.md` and `[[strategies]]` advise. - Assume GPU count, type, or interconnect. - Generate Slurm/Airflow/Kubeflow wrappers unless requested. +- Route to airgap for generic deployment requests; require an explicit airgap, + offline, no-internet, or image-tar bundle ask. - Modify [src/nemotron/steps/](../../src/nemotron/steps/). To extend the catalog, route the user to `/nemotron-add-step`. - Restate per-step rules in this skill — link to the step's `SKILL.md` instead.